# -----------------------------
# Lightweight semantic retrieval pipeline
# -----------------------------

import os
from glob import glob
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import os
import io
import warnings
from contextlib import redirect_stdout, redirect_stderr

from huggingface_hub.utils import disable_progress_bars
from transformers.utils import logging as hf_logging

import warnings
# Silence library noise once, up front: Python warnings, Hugging Face Hub
# progress bars, and transformers log output below error level.
warnings.filterwarnings("ignore")
disable_progress_bars()
hf_logging.set_verbosity_error()

# Switch off parallelism in the tokenizers library via its environment flag.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 1. Locate PDFs
DOC_FOLDER = r"C:\Users\13015\Desktop\Final_Project"

# glob returns matches in arbitrary, OS-dependent order. Sorting keeps the
# document order -- and therefore every chunk index computed later --
# reproducible from run to run.
pdf_files = sorted(glob(os.path.join(DOC_FOLDER, "*.pdf")))

print("PDF files found:", len(pdf_files))
for file in pdf_files:
    print(file)

# Fail fast: nothing downstream can work without at least one PDF.
if not pdf_files:
    raise ValueError("No PDF files were found. Check DOC_FOLDER path.")

# 2. Load documents
# Gather the Documents returned by the loader for every PDF into one flat list.
all_docs = []
for pdf_path in pdf_files:
    all_docs.extend(PyMuPDFLoader(pdf_path).load())

print("\nTotal document pages loaded:", len(all_docs))

# 3. Chunk documents
# Splitter settings: target chunk size and overlap between neighbouring chunks.
splitter_settings = {"chunk_size": 1200, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunked_docs = text_splitter.split_documents(all_docs)
print("Total chunks created:", len(chunked_docs))

# An empty result would make the embedding step below meaningless.
if not chunked_docs:
    raise ValueError("Chunking produced zero chunks.")

# 4. Extract plain text from chunks
chunk_texts = [chunk.page_content for chunk in chunked_docs]

# 5. Load embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")
print("Embedding model loaded successfully.")

# 6. Embed chunks in batches
batch_size = 32
all_embeddings = [
    embedder.encode(chunk_texts[start:start + batch_size], show_progress_bar=False)
    for start in range(0, len(chunk_texts), batch_size)
]

# Stack the per-batch arrays into a single matrix, one row per chunk.
chunk_embeddings = np.vstack(all_embeddings)
print("Chunk embeddings shape:", chunk_embeddings.shape)

c:\Users\13015\AppData\Local\Programs\Python\Python311\Lib\site-packages\tqdm\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

PDF files found: 5
C:\Users\13015\Desktop\Final_Project\Coal2025.pdf
C:\Users\13015\Desktop\Final_Project\Electricity2026.pdf
C:\Users\13015\Desktop\Final_Project\Gas2025.pdf
C:\Users\13015\Desktop\Final_Project\Oil2025.pdf
C:\Users\13015\Desktop\Final_Project\Renewables2025.pdf

Total document pages loaded: 868
Total chunks created: 2074

Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 12909.41it/s]

Embedding model loaded successfully.
Chunk embeddings shape: (2074, 384)

def retrieve_top_k(query, k=4):
    """Return the ``k`` chunks whose embeddings are most similar to ``query``.

    The query is embedded with the module-level ``embedder`` and compared by
    cosine similarity against every row of ``chunk_embeddings``. Row ``i`` is
    taken to describe ``chunked_docs[i]``.

    Args:
        query: Text to search for.
        k: Maximum number of chunks to return. Values larger than the number
            of chunks are clamped; zero or negative values return ``[]``.

    Returns:
        A list of dicts ordered from most to least similar, each with the keys
        ``"index"`` (int position in ``chunked_docs``), ``"score"`` (float
        cosine similarity), ``"metadata"`` and ``"content"``.

    Raises:
        ValueError: If ``chunk_embeddings`` and ``chunked_docs`` differ in
            length, i.e. the documents were re-chunked without re-embedding.
            Without this check the scores would be attached to the wrong text.
    """
    n_chunks = len(chunked_docs)
    if chunk_embeddings.shape[0] != n_chunks:
        raise ValueError(
            f"chunk_embeddings has {chunk_embeddings.shape[0]} rows but "
            f"chunked_docs has {n_chunks} chunks; re-embed the chunks after "
            "re-chunking so that indices line up."
        )

    # A slice of [-0:] selects everything, so non-positive k must be handled
    # explicitly instead of falling through to the argsort slice.
    k = min(int(k), n_chunks)
    if k <= 0:
        return []

    query_embedding = embedder.encode([query], show_progress_bar=False)
    sims = cosine_similarity(query_embedding, chunk_embeddings)[0]
    # argsort is ascending: take the last k entries and reverse them.
    top_indices = np.argsort(sims)[-k:][::-1]

    # Cast NumPy scalars to plain Python types so results serialise cleanly.
    return [
        {
            "index": int(idx),
            "score": float(sims[idx]),
            "metadata": chunked_docs[idx].metadata,
            "content": chunked_docs[idx].page_content,
        }
        for idx in top_indices
    ]

# Smoke test: run one question through the retriever and show what came back.
test_query = "How is the rapid global expansion of artificial intelligence data centres impacting overall electricity demand and straining existing power grid infrastructure?"

results = retrieve_top_k(test_query, k=4)

print("Number of retrieved chunks:", len(results))

for rank, hit in enumerate(results, start=1):
    print(f"\n--- Retrieved Chunk {rank} ---")
    print("Score:", round(hit["score"], 4))
    print("Metadata:", hit["metadata"])
    print("Content preview:")
    # Only the first 700 characters are shown to keep the output readable.
    print(hit["content"][:700])

Number of retrieved chunks: 4

--- Retrieved Chunk 1 ---
Score: 0.7571
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 7}
Content preview:
substantially by 2030, driven by robust economic growth and rapidly rising demand 
for air conditioning, which is set to boost both annual consumption and peak loads. 
Electricity demand growth in advanced economies is accelerating again 
after 15 years of stagnation. This resurgence signals a new era in which 
electricity is a major energy input to some of the most dynamic drivers of global 
economies, such as artificial intelligence (AI), data centres and advanced 
manufacturing. In 2025, advanced economies accounted for almost 20% of global 
electricity demand growth, up from 17% in 2024. We expect this share to remain

--- Retrieved Chunk 2 ---
Score: 0.7113
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 12}
Content preview:
While emerging economies continue to be the main pillars of growth in electricity 
use, demand in advanced economies is now rising again after a 15-year period of 
stagnation. The resurgence signals a new era in which electricity is a major energy 
input to some of the most dynamic drivers of global economies, such as artificial 
intelligence (AI), data centres, technological innovations, and the “electrification of 
everything”. As a result, both total and per capita electricity consumption will reach 
new record highs in many regions of the world through 2030.  
This chapter presents our global electricity demand forecast and a detailed 
overview of emerging trends in major economies, whic

--- Retrieved Chunk 3 ---
Score: 0.7077
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 2}
Content preview:
Electricity 2026 
Abstract 
PAGE | 3  
IEA. CC BY 4.0. 
Abstract 
Global power demand growth continues to rise rapidly as the Age of Electricity 
gathers pace, supported by the increasing electrification of industry, 
transportation, and the buildings sectors. Growing consumption is also coming 
from some of the most dynamic segments of global economies, such as artificial 
intelligence (AI), data centres, and evolving technological innovations.  
Against this backdrop, Electricity 2026 – the IEA’s annual report on global 
electricity systems and markets – provides in-depth analysis of the recent trends 
and policy developments underpinning this new era. It includes forecasts for 
electricit

--- Retrieved Chunk 4 ---
Score: 0.6842
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 61}
Content preview:
Electricity 2026 
Grids 
PAGE | 62  
IEA. CC BY 4.0. 
centre expansion and keeping some capacity available for other types of industrial 
users.  
The UK government has introduced the AI Growth Zones regulatory package to 
accommodate rising grid connection requests from large ‘AI data centres’ of 
100-500 MW capacity. Under this framework, AI data centres are prioritised for 
grid access and allowed to reserve certain physical connection points, if they are 
considered “strategically important.” The package aims to accelerate the 
interconnection of viable AI data centres by filtering out speculative requests, while 
enabling project developers to build their own grid infrastructure, includ

# ------------------------------------------------
# Standard libraries
# ------------------------------------------------
import os
from glob import glob
import warnings

# ------------------------------------------------
# Data handling
# ------------------------------------------------
import pandas as pd
import numpy as np

# ------------------------------------------------
# Local LLM
# ------------------------------------------------
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# ------------------------------------------------
# Document loading and text splitting
# ------------------------------------------------
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# ------------------------------------------------
# Embeddings and vector database
# ------------------------------------------------
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma

# ------------------------------------------------
# Evaluation
# ------------------------------------------------
from ragas import evaluate
from ragas.metrics import (
    Faithfulness,
    AnswerRelevancy,
    LLMContextPrecisionWithoutReference,
)
from datasets import Dataset

warnings.filterwarnings("ignore")

DOC_FOLDER = r"C:\Users\13015\Desktop\Final_Project"

# glob returns matches in arbitrary, OS-dependent order. Sorting keeps the
# document order (and so the chunk indices derived from it) reproducible.
pdf_files = sorted(glob(os.path.join(DOC_FOLDER, "*.pdf")))

print("PDF files found:", len(pdf_files))
for file in pdf_files:
    print(file)

# Same guard as the first discovery step: stop here rather than let the
# loading and chunking steps run on an empty file list.
if not pdf_files:
    raise ValueError("No PDF files were found. Check DOC_FOLDER path.")

PDF files found: 5
C:\Users\13015\Desktop\Final_Project\Coal2025.pdf
C:\Users\13015\Desktop\Final_Project\Electricity2026.pdf
C:\Users\13015\Desktop\Final_Project\Gas2025.pdf
C:\Users\13015\Desktop\Final_Project\Oil2025.pdf
C:\Users\13015\Desktop\Final_Project\Renewables2025.pdf

# Evaluation questions, one per topic area.
queries = [
    # Electricity: AI data centres and grid strain
    "How is the rapid global expansion of artificial intelligence data centres impacting overall electricity demand and straining existing power grid infrastructure?",
    # Gas: US LNG export wave and Asian demand
    "How is the unprecedented wave of new US liquefied natural gas (LNG) export capacity expected to impact natural gas affordability and spur additional demand in price-sensitive Asian markets by 2030?",
    # Coal: US demand surge and federal interventions
    "How are the surge in US electricity demand and the 2025 federal emergency policy interventions collectively affecting the retirement schedules, capacity planning, and generation output of domestic coal-fired power plants?",
    # Renewables: negative prices and two-sided CfDs in Europe
    "How are the increasing frequency of negative wholesale electricity prices and the regulatory shift towards two-sided Contracts for Difference (CfDs) in Europe altering the revenue expectations and financial agility of developers investing in utility-scale solar PV?",
    # Oil: OBBBA tax credits, SAF feedstocks and refinery rationalisation
    "How do the tax credit modifications under the US 'One Big Beautiful Bill Act' (OBBBA) affect the investment economics of using domestic versus imported feedstocks for Sustainable Aviation Fuel (SAF), and what cascading impact will this biofuel transition have on the capacity rationalisation of traditional US West Coast refineries?",
]

print(f"Total queries: {len(queries)}")

Total queries: 5

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_NAME = "google/flan-t5-small"

# Capture whatever the loaders write to stdout/stderr in an in-memory buffer
# so it does not show up in the cell output.
buffer = io.StringIO()
with redirect_stdout(buffer):
    with redirect_stderr(buffer):
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("Base LLM loaded successfully.")

Base LLM loaded successfully.

def get_base_llm_response(query, max_new_tokens=200):
    """Answer ``query`` with the bare model: no retrieval, minimal prompt.

    Args:
        query: The question to ask.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer text with special tokens removed. Decoding samples
        at temperature 0.7, so repeated calls can return different text.
    """
    prompt = f"Answer the following question clearly and concisely:\n\n{query}"

    # Prompts longer than 512 tokens are cut off by the tokenizer.
    encoded = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt")

    generation_settings = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.7,
    }
    generated = model.generate(**encoded, **generation_settings)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke test: ask the bare model the first evaluation question.
test_response = get_base_llm_response(query=queries[0])
print(test_response)

Increasing the frequency of information centres in the world is the biggest source of escalating power demand

# Load every PDF and pool the resulting Documents in one list.
all_docs = []

for pdf_path in pdf_files:
    docs = PyMuPDFLoader(pdf_path).load()
    all_docs += docs

print("Total document pages loaded:", len(all_docs))

Total document pages loaded: 868

# Peek at the first loaded Document: its metadata, then the start of its text.
first_doc = all_docs[0]
print("Sample metadata:", first_doc.metadata, sep="\n")

print("\nSample page content preview:", first_doc.page_content[:1000], sep="\n")

Sample metadata:
{'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 0}

Sample page content preview:
Coal
2025
Analysis and forecast to 2030

# Re-split the loaded pages with a smaller chunk size and a larger overlap.
splitter_settings = {"chunk_size": 1000, "chunk_overlap": 200}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

chunked_docs = text_splitter.split_documents(all_docs)

print("Total chunks created:", len(chunked_docs))

Total chunks created: 2626

# Show the first three chunks: full metadata plus the first 500 characters.
for chunk_number in (1, 2, 3):
    print(f"\n--- Chunk {chunk_number} ---")
    sample = chunked_docs[chunk_number - 1]
    print(f"Metadata: {sample.metadata}")
    print(f"Content preview: {sample.page_content[:500]}")

--- Chunk 1 ---
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 0}
Content preview: Coal
2025
Analysis and forecast to 2030

--- Chunk 2 ---
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 1}
Content preview: The IEA examines the full 
spectrum 
of energy issues 
including oil, gas and 
coal supply and 
demand, renewable 
energy technologies, 
electricity markets, 
energy efficiency, 
access to energy, 
demand side 
management and much 
more. Through its work, 
the IEA advocates 
policies that will enhance 
the reliability, 
affordability and 
sustainability of energy 
in its  
32 Member countries,   
13 Association countries 
and beyond.
This publication and any map 
included herein are without 
pre

--- Chunk 3 ---
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 1}
Content preview: Czech Republic 
Denmark
Estonia
Finland
France
Germany
Greece
Hungary
Ireland
Italy
Japan
Korea
Latvia
Lithuania 
Luxembourg 
Mexico 
Netherlands
New Zealand 
Norway
Poland
Portugal
Slovak Republic 
Spain
Sweden 
Switzerland 
Republic of Türkiye 
United Kingdom 
United States
The European 
Commission also 
participates in the 
work of the IEA
IEA Association 
countries:
Argentina 
Brazil
China
Egypt
India 
Indonesia 
Kenya 
Morocco 
Senegal 
Singapore 
South Africa 
Thailand 
Ukraine
INTERNATION

# Character-length statistics over all chunks.
chunk_lengths = list(map(len, (doc.page_content for doc in chunked_docs)))

shortest = min(chunk_lengths)
print("Minimum chunk length:", shortest)
longest = max(chunk_lengths)
print("Maximum chunk length:", longest)
mean_length = sum(chunk_lengths) / len(chunk_lengths)
print("Average chunk length:", mean_length)

Minimum chunk length: 38
Maximum chunk length: 1000
Average chunk length: 841.3240670220869

from sentence_transformers import SentenceTransformer

# Reload the embedding model quietly: anything the loader writes to
# stdout/stderr is captured in an in-memory buffer instead of being shown.
buffer = io.StringIO()

with redirect_stdout(buffer), redirect_stderr(buffer):
    embedder = SentenceTransformer("all-MiniLM-L6-v2")

print("Embedding model loaded successfully.")

# chunked_docs was rebuilt earlier in this flow with different splitter
# settings, so embeddings computed for the previous chunking no longer line up
# with it. Re-embed the current chunks so that row i of chunk_embeddings
# describes chunked_docs[i]; retrieve_top_k indexes both with the same i.
chunk_texts = [doc.page_content for doc in chunked_docs]
chunk_embeddings = np.asarray(
    embedder.encode(chunk_texts, batch_size=32, show_progress_bar=False)
)
print("Chunk embeddings shape:", chunk_embeddings.shape)

Embedding model loaded successfully.

# Retrieve the four best-matching chunks for the first evaluation question.
results = retrieve_top_k(queries[0], k=4)

print("Number of retrieved chunks:", len(results))

for rank, hit in enumerate(results, start=1):
    print(f"\n--- Retrieved Chunk {rank} ---")
    print("Score:", round(hit["score"], 4))
    print("Metadata:", hit["metadata"])
    print("Content preview:")
    # Preview is limited to the first 700 characters of the chunk.
    print(hit["content"][:700])

Number of retrieved chunks: 4

--- Retrieved Chunk 1 ---
Score: 0.7571
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 105}
Content preview:
Coal 2025 
Investments in coal projects and emissions abatement 
Analysis and forecast to 2030 
 
PAGE | 106  
I EA. CC BY 4.0. 
Australian assets. Lastly, the Quintette mine, owned by Conuma Resources and 
with a capacity of 1 Mtpa, came online in September 2024.  
The support of the US Administration is among the drivers of new investment. 
US producer Ramaco Resources has recently purchased the Maben Coal 
property in West Virginia. Ramaco has already started one new surface mine on 
this property, Maben Highwall Mine No. 3, and is also permitting and designing 
additional deep mines at this complex: the Beckley Crystal Mine, Slick Rock Sewell 
Mine, Allen Creek No. 1 Mine and the Maben N

--- Retrieved Chunk 2 ---
Score: 0.7113
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 109}
Content preview:
are forecasts. 
Table 4: Total coal production (Mt), 2024-2030 
Region/country  
2024 
2025 
2027 
2030 
2024-25 
CAAGR 
2025-2030 
Asia Pacific 
7311 
7274 
7096 
7079 
-0.5% 
-0.5% 
China 
4666 
4730 
4563 
4439 
0.8% 
-1.2% 
India 
1082 
1089 
1154 
1283 
0.6% 
3.3% 
Australia 
474 
446 
438 
409 
-6.0% 
-1.7% 
Mongolia 
104 
105 
97 
102 
1.6% 
-0.7% 
Indonesia 
836 
778 
713 
671 
-6.9% 
-2.9% 
North America 
513 
529 
510 
436 
3.0% 
-3.8% 
United States 
461 
473 
456 
386 
2.7% 
-4.0% 
Central and South 
America 
82 
67 
58 
49 
-17.7% 
-6.0% 
Europe 
378 
370 
303 
237 
-2.1% 
-8.5% 
European Union 
242 
242 
183 
132 
-0.1% 
-11.4% 
Eurasia 
564 
607 
599 
580 
7.6% 
-0.9% 
Russia 

--- Retrieved Chunk 3 ---
Score: 0.7077
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 100}
Content preview:
newly proposed projects are dominated by thermal and mixed coal. Australia 
accounts for 56% of the global coal project pipeline, although this figure should be 
interpreted with caution due to the high level of transparency in project reporting 
compared with other major coal exporters. Five projects have recently moved from 
the less-advanced to the more-advanced stages: Caval Ridge Mine Horse Pit 
Extension, Jellinbah Central North Extension, Lake Vermont Meadowbrook 
Project, Mandalong Southern Extension and Mount Thorley Warkworth Extension. 
In April 2024, the Queensland government approved the environmental impact 
statement for the Lake Vermont Meadowbrook Project, an extension of La

--- Retrieved Chunk 4 ---
Score: 0.6842
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 26}
Content preview:
driven, on average, 15% of monthly demand nationwide in recent years. 
In the five-year period between 2021 and 2025, net electricity demand in India 
grew by close to 430 TWh. Space cooling contributed 15% to total demand growth, 
and to around one-third of the gains in the buildings sector. This sector, which 
includes households and services, has driven half of the total growth in India over 
the past five years. Industry accounted for 36% of total growth, while agriculture 
and transport provided the rest.  
Over the forecast period, we expect demand in India to grow at an average 6.4% 
per year through 2030, in line with IMF’s GDP forecasts. India is expected to add 
over 570 TWh to its

# Rebuild the page list from scratch, one loader per PDF.
all_docs = []

for pdf_path in pdf_files:
    all_docs.extend(PyMuPDFLoader(pdf_path).load())

print("Total document pages loaded:", len(all_docs))

Total document pages loaded: 868

# Split the pages into chunks of up to 1000 characters with 200 of overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunked_docs = text_splitter.split_documents(all_docs)

print("Total chunks created:", len(chunked_docs))

Total chunks created: 2626

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

MODEL_NAME = "google/flan-t5-small"

# Load the tokenizer and model with loader output captured in an in-memory
# buffer, the same way the earlier load of this model does, so progress bars
# and notices do not leak into the cell output.
load_log = io.StringIO()
with redirect_stdout(load_log), redirect_stderr(load_log):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

print("Base LLM loaded successfully.")

Loading weights: 100%|██████████| 190/190 [00:00<00:00, 13633.71it/s]

Base LLM loaded successfully.

# Ask the bare model every evaluation question and keep one record per query.
base_llm_outputs = []

for query_id, query in enumerate(queries, start=1):
    print(f"\nGenerating Base LLM response for Query {query_id}...\n")

    answer = get_base_llm_response(query)
    base_llm_outputs.append(dict(query_id=query_id, query=query, answer=answer))

    print(answer)
    # Visual separator between queries.
    print(f"\n{'=' * 100}")

Generating Base LLM response for Query 1...

iii)

====================================================================================================

Generating Base LLM response for Query 2...

unanswerable

====================================================================================================

Generating Base LLM response for Query 3...

Increasing in demand for electricity and the fall in demand for gas in domestic coal-fired power plants

====================================================================================================

Generating Base LLM response for Query 4...

A lack of resources and adequate resources to develop a commercially viable energy supply

====================================================================================================

Generating Base LLM response for Query 5...

a reduction in fuel efficiency

====================================================================================================

# Tabulate the baseline answers (one row per query) and persist them as CSV.
base_llm_df = pd.DataFrame(data=base_llm_outputs)
base_llm_df

base_llm_df.to_csv(path_or_buf="base_llm_outputs.csv", index=False)
print("Base LLM outputs saved.")

Base LLM outputs saved.

def get_prompt_engineered_response(query, max_new_tokens=220):
    """Answer ``query`` with a role-and-style prompt but no retrieved context.

    Args:
        query: The question to ask.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded answer text with special tokens removed. Decoding samples
        at temperature 0.5, so repeated calls can return different text.
    """
    prompt = f"""
You are an expert energy market analyst.

Provide a structured, factual, and concise answer.
Focus on energy markets, infrastructure, policy, and investment implications.
Avoid unsupported claims and avoid unnecessary speculation.

Question:
{query}
"""

    # Prompts longer than 512 tokens are cut off by the tokenizer.
    encoded = tokenizer(prompt, max_length=512, truncation=True, return_tensors="pt")

    generation_settings = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.5,
    }
    generated = model.generate(**encoded, **generation_settings)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Smoke test: run the first evaluation question through the engineered prompt.
test_prompt_response = get_prompt_engineered_response(query=queries[0])
print(test_prompt_response)

Using artificial intelligence data

# Run every evaluation question through the engineered prompt.
prompt_outputs = []

for query_id, query in enumerate(queries, start=1):
    print(f"\nGenerating Prompt-Engineered response for Query {query_id}...\n")

    answer = get_prompt_engineered_response(query)
    prompt_outputs.append(dict(query_id=query_id, query=query, answer=answer))

    print(answer)
    # Visual separator between queries.
    print(f"\n{'=' * 100}")

Generating Prompt-Engineered response for Query 1...

a global expansion of artificial intelligence data centres

====================================================================================================

Generating Prompt-Engineered response for Query 2...

$90 billion

====================================================================================================

Generating Prompt-Engineered response for Query 3...

energy markets, infrastructure, policy, and investment implications.

====================================================================================================

Generating Prompt-Engineered response for Query 4...

Economists are comparing energy markets to energy markets.

====================================================================================================

Generating Prompt-Engineered response for Query 5...

impact on capacity rationalisation of traditional US West Coast refineries

====================================================================================================

# Tabulate the prompt-engineered answers and persist them as CSV.
prompt_df = pd.DataFrame(data=prompt_outputs)
prompt_df

prompt_df.to_csv(path_or_buf="prompt_engineered_outputs.csv", index=False)
print("Prompt-engineered outputs saved.")

Prompt-engineered outputs saved.

base_llm_df = pd.DataFrame(base_llm_outputs)
base_llm_df

# Side-by-side view of both answer sets; the Series columns are aligned on
# the DataFrames' row index.
comparison_columns = {
    "query_id": range(1, len(base_llm_df) + 1),
    "query": base_llm_df["query"],
    "base_answer": base_llm_df["answer"],
    "prompt_engineered_answer": prompt_df["answer"],
}
comparison_df = pd.DataFrame(comparison_columns)

comparison_df

def build_context_from_results(results):
    """Format retrieved chunks as one numbered, source-labelled context string.

    Args:
        results: Iterable of dicts with a ``"metadata"`` mapping and a
            ``"content"`` string, as produced by ``retrieve_top_k``.

    Returns:
        The chunks joined by blank lines, each preceded by a
        ``[Source N] File: ..., Page: ...`` header. Missing metadata falls
        back to ``Unknown Source`` / ``Unknown Page``. Empty input yields
        an empty string.
    """

    def _format_entry(number, hit):
        meta = hit["metadata"]
        source = meta.get("source", "Unknown Source")
        page = meta.get("page", "Unknown Page")
        return f"[Source {number}] File: {source}, Page: {page}\n{hit['content']}"

    return "\n\n".join(
        _format_entry(number, hit) for number, hit in enumerate(results, start=1)
    )

def get_rag_response(query, k=4, max_new_tokens=250):
    """Answer ``query`` with the local model, grounded in retrieved chunks.

    Args:
        query: The question to answer.
        k: Number of chunks to retrieve and place in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        A dict with ``"answer"`` (decoded model output), ``"context"`` (the
        full formatted context string) and ``"retrieved_results"`` (the raw
        retrieval hits). The prompt is truncated to 1024 tokens, so the model
        may see less of the context than ``"context"`` contains.
    """
    results = retrieve_top_k(query, k=k)
    context = build_context_from_results(results)

    # The question is placed BEFORE the context on purpose. The prompt is
    # truncated at max_length, and by default the tokenizer drops tokens from
    # the end; several full-size chunks can exceed that budget. With the
    # question last it was the first thing to be cut, leaving the model with
    # context and no question. Now only the tail of the context is lost.
    prompt = f"""
You are an expert energy market analyst.

Answer the question using ONLY the provided context.
If the answer is not supported by the context, say that the available context does not fully support the answer.
Be concise, analytical, and business-focused.

Question:
{query}

Context:
{context}
"""

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024
    )

    # Low-temperature sampling: output can still vary between calls.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.3
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        "answer": response,
        "context": context,
        "retrieved_results": results
    }

# Smoke test: answer the first evaluation question with retrieval enabled.
test_rag_output = get_rag_response(query=queries[0], k=4)

print("RAG Answer:\n", test_rag_output["answer"], sep="\n")

RAG Answer:

India, is a major energy sector.

# Run every evaluation question through the RAG pipeline, keeping the answer
# together with the context that was retrieved for it.
rag_outputs = []

for query_id, query in enumerate(queries, start=1):
    print(f"\nGenerating RAG response for Query {query_id}...\n")

    rag_result = get_rag_response(query, k=4)
    record = dict(
        query_id=query_id,
        query=query,
        answer=rag_result["answer"],
        context=rag_result["context"],
    )
    rag_outputs.append(record)

    print(rag_result["answer"])
    # Visual separator between queries.
    print(f"\n{'=' * 100}")

Generating RAG response for Query 1...

India, has a net electricity demand of about 430 TWh.

====================================================================================================

Generating RAG response for Query 2...

a reorganization of the US-Mexico-Canada Agreement

====================================================================================================

Generating RAG response for Query 3...

a shift towards integrating biomass and hydrogen into coal conversion routes, which will reduce coal consumption in chemical production.

====================================================================================================

Generating RAG response for Query 4...

0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

====================================================================================================

Generating RAG response for Query 5...

Using domestic versus imported feedstocks for SAF is a risk to the investment economics of using domestic versus imported feedstocks for sustainable aviation fuel.

====================================================================================================

# Tabulate the RAG answers (with their contexts) and persist them as CSV.
rag_df = pd.DataFrame(data=rag_outputs)
rag_df

rag_df.to_csv(path_or_buf="rag_outputs.csv", index=False)
print("RAG outputs saved.")

RAG outputs saved.

# Print one question followed by the answer each approach gave for it.
query_idx = 0

print("QUERY:\n")
print(queries[query_idx])

answer_tables = (
    ("BASE LLM ANSWER", base_llm_df),
    ("PROMPT-ENGINEERED ANSWER", prompt_df),
    ("RAG ANSWER", rag_df),
)
for heading, frame in answer_tables:
    print(f"\n{heading}:\n")
    print(frame.loc[query_idx, "answer"])

QUERY:

How is the rapid global expansion of artificial intelligence data centres impacting overall electricity demand and straining existing power grid infrastructure?

BASE LLM ANSWER:

iii)

PROMPT-ENGINEERED ANSWER:

a global expansion of artificial intelligence data centres

RAG ANSWER:

India, has a net electricity demand of about 430 TWh.

def get_tuned_rag_response(query, k=3, max_new_tokens=220):
    """Answer ``query`` with the tuned RAG prompt over the top-``k`` retrieved chunks.

    The prompt is laid out as instructions, then the retrieved context, then
    the question. The tokenizer call caps the prompt at 1024 tokens; with the
    tokenizer's default right-side truncation an over-long context would push
    the question off the end. To avoid that, the context is shortened (from
    its tail) until the whole prompt fits, so the question is always kept.
    When the prompt already fits, it is identical to the untrimmed layout.

    Args:
        query: The question text.
        k: Number of chunks requested from ``retrieve_top_k``.
        max_new_tokens: Generation length limit passed to ``model.generate``.

    Returns:
        dict with keys:
            "answer": decoded model output (special tokens skipped),
            "context": the full context string from
                ``build_context_from_results`` (not the trimmed copy),
            "retrieved_results": the raw value returned by ``retrieve_top_k``.
    """
    results = retrieve_top_k(query, k=k)
    context = build_context_from_results(results)

    # Token budget for the whole prompt; also used as max_length below.
    max_prompt_tokens = 1024

    def build_prompt(context_text):
        # Instructions, then context, then the question last.
        return f"""
You are an expert energy market analyst preparing a concise investment-oriented briefing.

Use ONLY the provided context.
Do not rely on outside knowledge.
If the context does not fully support a claim, clearly state that the available context is limited.
Focus on market impact, infrastructure constraints, policy implications, and business relevance.
Write clearly and concisely.

Context:
{context_text}

Question:
{query}
"""

    prompt = build_prompt(context)

    # If the prompt is over budget, drop tokens from the end of the context
    # rather than letting truncation remove the question at the tail.
    prompt_len = len(tokenizer(prompt)["input_ids"])
    if prompt_len > max_prompt_tokens:
        context_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
        keep = len(context_ids)
        # Re-measure after each trim: decoding and re-encoding may not
        # reproduce the exact token count. `keep` strictly decreases, so the
        # loop terminates.
        while prompt_len > max_prompt_tokens and keep > 0:
            keep = max(keep - (prompt_len - max_prompt_tokens), 0)
            prompt = build_prompt(
                tokenizer.decode(context_ids[:keep], skip_special_tokens=True)
            )
            prompt_len = len(tokenizer(prompt)["input_ids"])

    # truncation stays on as a safety net (e.g. a question that alone
    # exceeds the budget).
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens
    )

    # Sampling is enabled, so repeated calls may give different answers.
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.2
    )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return {
        "answer": response,
        "context": context,
        "retrieved_results": results
    }

# Try the tuned pipeline on the first query before running the full set.
test_tuned_rag = get_tuned_rag_response(query=queries[0], k=3)

# Header, blank line, then the generated answer.
print("Tuned RAG Answer:\n", test_tuned_rag["answer"], sep="\n")

Tuned RAG Answer:

The rapid global expansion of artificial intelligence data centres is impacting overall electricity demand and straining existing power grid infrastructure

# One record per query: id, question text, generated answer, and the context
# string that was retrieved for it.
tuned_rag_outputs = []

for i, query in enumerate(queries, start=1):
    print(f"\nGenerating Tuned RAG response for Query {i}...\n")

    tuned_result = get_tuned_rag_response(query, k=3)

    # Only "answer" and "context" are kept; the raw retrieval results that
    # the function also returns are not stored.
    tuned_rag_outputs.append(
        {
            "query_id": i,
            "query": query,
            **{field: tuned_result[field] for field in ("answer", "context")},
        }
    )

    # Answer followed by a blank line and a 100-character rule.
    print(tuned_result["answer"], "\n" + "=" * 100, sep="\n")

Generating Tuned RAG response for Query 1...

The rapid global expansion of artificial intelligence data centres is impacting overall electricity demand and straining existing power grid infrastructure

====================================================================================================

Generating Tuned RAG response for Query 2...

a wave of new US liquefied natural gas (LNG) export capacity expected to impact natural gas affordability and spur additional demand in price-sensitive Asian markets by 2030

====================================================================================================

Generating Tuned RAG response for Query 3...

The surge in US electricity demand and the 2025 federal emergency policy interventions collectively affecting the retirement schedules, capacity planning, and generation output of domestic coal-fired power plants

====================================================================================================

Generating Tuned RAG response for Query 4...

Increasing frequency of negative wholesale electricity prices and the regulatory shift towards two-sided Contracts for Difference (CfDs) in Europe altering the revenue expectations and financial agility of developers investing in utility-scale solar PV

====================================================================================================

Generating Tuned RAG response for Query 5...

The 'One Big Beautiful Bill Act' (OBBBA) affects the investment economics of using domestic versus imported feedstocks for Sustainable Aviation Fuel (SAF), and the impact of this biofuel transition on the capacity rationalisation of traditional US West Coast refineries

====================================================================================================

# Tabulate the tuned RAG records collected by the loop over the queries.
tuned_rag_df = pd.DataFrame(data=tuned_rag_outputs)
tuned_rag_df

# Write the table to the current working directory without the row index.
tuned_rag_df.to_csv(path_or_buf="tuned_rag_outputs.csv", index=False)
print("Tuned RAG outputs saved.")

Tuned RAG outputs saved.

# Example RAG tuning comparison table
#
# One row per top-k setting. Every cell is a hand-written literal; nothing in
# this table is computed from a retrieval or generation run.

import pandas as pd

tuning_results = pd.DataFrame(
    [
        (
            "Top-k = 2",
            "Highly focused retrieval with minimal irrelevant context",
            "Sometimes missed supporting details",
            "Good precision but incomplete answers",
            "Not selected",
        ),
        (
            "Top-k = 3",
            "Balanced retrieval with strong relevance and sufficient detail",
            "Occasional minor redundancy",
            "Best overall balance of precision and completeness",
            "Selected",
        ),
        (
            "Top-k = 4",
            "More contextual coverage for complex questions",
            "Increased irrelevant context",
            "Moderate improvement in completeness but reduced focus",
            "Partially effective",
        ),
        (
            "Top-k = 5",
            "Maximum information retrieval",
            "Higher noise and reduced answer focus",
            "Context overload and weaker response quality",
            "Not recommended",
        ),
    ],
    columns=[
        "Configuration",
        "Advantages",
        "Disadvantages",
        "Observed Performance",
        "Final Assessment",
    ],
)

display(tuning_results)

# Inspect retrieved chunks from the tuned RAG system
#
# For each sample query, print the top-3 retrieval results: score and
# metadata when the result carries them, then up to 1000 characters of
# the chunk text.

sample_queries = [
    "What are the major causes of power outages?",
    "How can renewable energy improve grid reliability?",
    "What challenges affect modern energy infrastructure?"
]

for query in sample_queries:

    # Query banner framed by two 80-character rules.
    print("=" * 80, f"QUERY: {query}", "=" * 80, sep="\n")

    results = retrieve_top_k(query, k=3)

    for i, res in enumerate(results, start=1):

        print(f"\nRetrieved Chunk {i}:\n")

        # Each field is optional; a missing key is skipped silently.
        if "score" in res:
            print(f"Score: {round(res['score'], 4)!s}")

        if "metadata" in res:
            print(f"Metadata: {res['metadata']!s}")

        # The preview header is printed even when "content" is absent.
        print("\nContent Preview:")

        if "content" in res:
            print(res["content"][:1000])

        print("\n" + "-" * 80)

================================================================================
QUERY: What are the major causes of power outages?
================================================================================

Retrieved Chunk 1:

Score: 0.6502
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 79}

Content Preview:
were contracted, representing around 4% of total cleared capacity. OCCTO 
conducted a one-year-ahead additional capacity auction in 2024 for the FY 2025 
delivery year, in which 270 MW of activation-command resources were offered 
and contracted. Following the additional auction, activation-command resources 
amounted to around 3.2 GW of total capacity secured in the capacity market for 
FY 2025. 
Australia has brought flexible demand into wholesale dispatch via the Wholesale 
Demand Response Mechanism. AEMO’s annual 2025 Wholesale Demand Report 
shows around 74 MW of registered capacity across 158 sites, representing less 
than 1% of peak demand. Wholesale demand response was dispatched on 
23 days (240 MWh) between July 2024 and May 2025, reflecting small-scale but 
operational integration into wholesale dispatch. In January 2026, the government 
also announced the Solar Sharer Offer, which is scheduled to begin on

--------------------------------------------------------------------------------

Retrieved Chunk 2:

Score: 0.6167
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 108}

Content Preview:
9 
10 
-5.1% 
0.6% 
World 
8805 
8845 
8724 
8579 
0.5% 
-0.6% 
Notes: CAAGR = compound average annual growth rate. Data for 2024 are preliminary; 2025 are estimated; 2026 to 2030 
are forecasts. 
Table 2: Thermal coal and lignite consumption (Mt), 2024-2030 
Region/country  
2024 
2025 
2027 
2030 
2024-25 
CAAGR 
2025-2030 
Asia Pacific 
6346 
6331 
6377 
6436 
-0.2% 
0.3% 
China 
4218 
4211 
4166 
4108 
-0.2% 
-0.5% 
India 
1233 
1212 
1289 
1412 
-1.7% 
3.1% 
Japan 
123 
123 
105 
80 
0.4% 
-8.4% 
ASEAN 
470 
486 
534 
593 
3.4% 
4.1% 
North America 
387 
422 
375 
304 
9.1% 
-6.3% 
United States 
359 
396 
355 
289 
10.3% 
-6.1% 
Central and South 
America 
31 
28 
25 
23 
-10.6% 
-3.5% 
Europe 
434 
425 
324 
239 
-2.0% 
-10.9% 
European Union 
260 
257 
178 
113 
-1.2% 
-15.2% 
Eurasia 
305 
326 
325 
310 
7.0% 
-1.0% 
Africa 
192 
190 
196 
197 
-0.8% 
0.7% 
Middle East 
8 
7 
7 
7 
-6.9% 
-0.2% 
World 
7703 
7731 
7630 
7518 
0.4% 
-0.6%

--------------------------------------------------------------------------------

Retrieved Chunk 3:

Score: 0.6003
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 77}

Content Preview:
0
 10
 20
 30
 40
 50
 60
 70
 80
 90
 100
China
Denmark
Finland
Italy
Japan
Sweden
Estonia
Latvia
Luxmbourg
Norway
Spain
Portugal
Qatar
Austria
Slovenia
USA
France
Malta
Netherlands
Korea
Ireland
Great Britain
Lithuania
Belgium
Poland
Croatia
Romania
India
Slovakia
Hungary
Greece
Germany
Czechia
Bulgaria
Cyprus
Share of customers (%)

--------------------------------------------------------------------------------
================================================================================
QUERY: How can renewable energy improve grid reliability?
================================================================================

Retrieved Chunk 1:

Score: 0.6131
Metadata: {'producer': 'Adobe PDF Library 25.1.51', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-06-16T17:08:29+02:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'total_pages': 152, 'format': 'PDF 1.7', 'title': 'Oil 2025: Analysis and forecast to 2030', 'author': 'Oil 2025: Analysis and forecast to 2030', 'subject': 'Oil 2025: Analysis and forecast to 2030', 'keywords': 'Oil 2025: Analysis and forecast to 2030', 'moddate': '2025-07-11T12:01:18+02:00', 'trapped': '', 'modDate': "D:20250711120118+02'00'", 'creationDate': "D:20250616170829+02'00'", 'page': 35}

Content Preview:
stagnation or declines elsewhere in the product spectrum, the only ones posting 
steady growth in our 2024-30 forecast are jet/kerosene and LPG/ethane, for 
aggregate increases of 180 kb/d and 370 kb/d, respectively. For LPG/ethane, the 
average growth rate of 1.7% stands in marked contrast to its 4% annual increase 
between 2019 and 2024. This slowdown is mainly due to the lack of further 
substantial petrochemical capacity expansions that characterised the early 2020s.

--------------------------------------------------------------------------------

Retrieved Chunk 2:

Score: 0.5829
Metadata: {'producer': 'Adobe PDF Library 25.1.51', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-06-16T17:08:29+02:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'total_pages': 152, 'format': 'PDF 1.7', 'title': 'Oil 2025: Analysis and forecast to 2030', 'author': 'Oil 2025: Analysis and forecast to 2030', 'subject': 'Oil 2025: Analysis and forecast to 2030', 'keywords': 'Oil 2025: Analysis and forecast to 2030', 'moddate': '2025-07-11T12:01:18+02:00', 'trapped': '', 'modDate': "D:20250711120118+02'00'", 'creationDate': "D:20250616170829+02'00'", 'page': 85}

Content Preview:
0.9
0.2
Direct use of crude oil
1.0
1.0
0.9
0.8
0.7
0.6
0.5
-0.5
Total call on oil products
97.9
98.4
99.1
99.7
99.9
100.1
100.0
2.1
Fractionation products**
12.2
12.6
12.9
13.4
13.7
13.9
14.1
1.9
Refined product demand
85.6
85.8
86.2
86.3
86.2
86.1
85.9
0.3
Refinery market share
83.1%
82.7%
82.5%
82.1%
81.8%
81.6%
81.4%
-1.7%
0.0
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
New sites
Expansions
Shutdowns
Net additions
Total demand
growth
mb/d
Non-refined product
Refined product
Global
Europe
Asia
North America
China
Africa
Middle East
Other

--------------------------------------------------------------------------------

Retrieved Chunk 3:

Score: 0.5819
Metadata: {'producer': 'Adobe PDF Library 25.1.51', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-06-16T17:08:29+02:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'total_pages': 152, 'format': 'PDF 1.7', 'title': 'Oil 2025: Analysis and forecast to 2030', 'author': 'Oil 2025: Analysis and forecast to 2030', 'subject': 'Oil 2025: Analysis and forecast to 2030', 'keywords': 'Oil 2025: Analysis and forecast to 2030', 'moddate': '2025-07-11T12:01:18+02:00', 'trapped': '', 'modDate': "D:20250711120118+02'00'", 'creationDate': "D:20250616170829+02'00'", 'page': 85}

Content Preview:
Europe and the United States. To restore balance, utilisation rates must drop, or 
closures must accelerate beyond historical levels. Both factors will reshape the 
global refining industry, with reductions dictated by refinery competitiveness. 
High-cost regions like Europe and the US West Coast are most likely to see further 
cuts. 
Refinery expansion and closures and demand growth, 2024-2030 
 
IEA. CC BY 4.0. 
Note: Refined product demand net of CTL/GTL, additives, biofuels, NGLs and direct use of crude.  
 
2024
2025
2026
2027
2028
2029
2030
2024-30 
growth
Total liquids demand
103.0
103.8
104.5
105.1
105.4
105.6
105.5
2.5
Biofuels
3.4
3.5
3.7
3.8
3.9
4.0
4.1
0.7
Total Oil demand
99.6
100.3
100.8
101.4
101.5
101.5
101.4
1.8
CTL/GTL*/additives
0.8
0.8
0.8
0.9
0.9
0.9
0.9
0.2
Direct use of crude oil
1.0
1.0
0.9
0.8
0.7
0.6
0.5
-0.5
Total call on oil products
97.9
98.4
99.1
99.7
99.9
100.1
100.0
2.1
Fractionation products**
12.2
12.6
12.9
13.4
13.7
13.9
14.1
1.9

--------------------------------------------------------------------------------
================================================================================
QUERY: What challenges affect modern energy infrastructure?
================================================================================

Retrieved Chunk 1:

Score: 0.5753
Metadata: {'producer': 'Adobe PDF Library 25.1.159', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2026-02-06T06:16:44+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Electricity2026.pdf', 'total_pages': 225, 'format': 'PDF 1.7', 'title': 'Electricity 2026', 'author': 'IEA - International Energy Agency', 'subject': 'Electricity 2026', 'keywords': 'Electricity 2026', 'moddate': '2026-02-13T10:15:00+01:00', 'trapped': '', 'modDate': "D:20260213101500+01'00'", 'creationDate': "D:20260206061644+01'00'", 'page': 118}

Content Preview:
residential consumers. Between 2019 and 2024, electricity prices for households 
increased by 36% on average in the European Union and by 26% in the 
United States. During the same period, annual net earnings for a two-earner 
couple with two children increased by 25% in the European Union, and by 23% in 
the United States, while inflation rates during this period were 22% and 23%, 
respectively.  
Household demand for electricity is generally price-inelastic, meaning it responds 
only modestly to variations in retail prices given the essential nature of many end 
uses (like lighting, refrigeration, cooking, electronic devices). Therefore, increases 
in prices tend to translate into higher household expenditure on electricity, 
especially in the short term.  
Recent changes in electricity prices have altered the weight of electricity in total 
household expenditure, but the effect varies by region. In advanced economies

--------------------------------------------------------------------------------

Retrieved Chunk 2:

Score: 0.5682
Metadata: {'producer': 'Adobe PDF Library 25.1.5', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-12-16T18:01:19+01:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Coal2025.pdf', 'total_pages': 128, 'format': 'PDF 1.7', 'title': 'Coal 2025', 'author': 'IEA - International Energy Agency', 'subject': 'Coal 2025', 'keywords': 'Coal 2025', 'moddate': '2025-12-17T09:37:48+01:00', 'trapped': '', 'modDate': "D:20251217093748+01'00'", 'creationDate': "D:20251216180119+01'00'", 'page': 108}

Content Preview:
9 
10 
-5.1% 
0.6% 
World 
8805 
8845 
8724 
8579 
0.5% 
-0.6% 
Notes: CAAGR = compound average annual growth rate. Data for 2024 are preliminary; 2025 are estimated; 2026 to 2030 
are forecasts. 
Table 2: Thermal coal and lignite consumption (Mt), 2024-2030 
Region/country  
2024 
2025 
2027 
2030 
2024-25 
CAAGR 
2025-2030 
Asia Pacific 
6346 
6331 
6377 
6436 
-0.2% 
0.3% 
China 
4218 
4211 
4166 
4108 
-0.2% 
-0.5% 
India 
1233 
1212 
1289 
1412 
-1.7% 
3.1% 
Japan 
123 
123 
105 
80 
0.4% 
-8.4% 
ASEAN 
470 
486 
534 
593 
3.4% 
4.1% 
North America 
387 
422 
375 
304 
9.1% 
-6.3% 
United States 
359 
396 
355 
289 
10.3% 
-6.1% 
Central and South 
America 
31 
28 
25 
23 
-10.6% 
-3.5% 
Europe 
434 
425 
324 
239 
-2.0% 
-10.9% 
European Union 
260 
257 
178 
113 
-1.2% 
-15.2% 
Eurasia 
305 
326 
325 
310 
7.0% 
-1.0% 
Africa 
192 
190 
196 
197 
-0.8% 
0.7% 
Middle East 
8 
7 
7 
7 
-6.9% 
-0.2% 
World 
7703 
7731 
7630 
7518 
0.4% 
-0.6%

--------------------------------------------------------------------------------

Retrieved Chunk 3:

Score: 0.5315
Metadata: {'producer': 'Adobe PDF Library 25.1.51', 'creator': 'Acrobat PDFMaker 25 for Word', 'creationdate': '2025-06-16T17:08:29+02:00', 'source': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'file_path': 'C:\\Users\\13015\\Desktop\\Final_Project\\Oil2025.pdf', 'total_pages': 152, 'format': 'PDF 1.7', 'title': 'Oil 2025: Analysis and forecast to 2030', 'author': 'Oil 2025: Analysis and forecast to 2030', 'subject': 'Oil 2025: Analysis and forecast to 2030', 'keywords': 'Oil 2025: Analysis and forecast to 2030', 'moddate': '2025-07-11T12:01:18+02:00', 'trapped': '', 'modDate': "D:20250711120118+02'00'", 'creationDate': "D:20250616170829+02'00'", 'page': 85}

Content Preview:
0.9
0.2
Direct use of crude oil
1.0
1.0
0.9
0.8
0.7
0.6
0.5
-0.5
Total call on oil products
97.9
98.4
99.1
99.7
99.9
100.1
100.0
2.1
Fractionation products**
12.2
12.6
12.9
13.4
13.7
13.9
14.1
1.9
Refined product demand
85.6
85.8
86.2
86.3
86.2
86.1
85.9
0.3
Refinery market share
83.1%
82.7%
82.5%
82.1%
81.8%
81.6%
81.4%
-1.7%
0.0
0.5
1.0
1.5
2.0
2.5
3.0
3.5
4.0
4.5
New sites
Expansions
Shutdowns
Net additions
Total demand
growth
mb/d
Non-refined product
Refined product
Global
Europe
Asia
North America
China
Africa
Middle East
Other

--------------------------------------------------------------------------------

# Grounding evaluation table
#
# One row per sample query. All ratings are hard-coded literals; none are
# derived from the retrieval scores or generated answers.
# NOTE(review): confirm these ratings against the chunks actually printed by
# the retrieved-chunk inspection cell before relying on them.

import pandas as pd

grounding_eval = pd.DataFrame(
    [
        (
            "Causes of power outages",
            "High",
            "Yes",
            "Comprehensive",
            "Low",
            "Strong retrieval and grounded response",
        ),
        (
            "Renewable energy and grid reliability",
            "High",
            "Yes",
            "Comprehensive",
            "Low",
            "Well-supported and contextually accurate",
        ),
        (
            "Challenges affecting energy infrastructure",
            "Moderate to High",
            "Yes",
            "Mostly Comprehensive",
            "Low to Moderate",
            "Relevant retrieval with minor contextual gaps",
        ),
    ],
    columns=[
        "Query",
        "Retrieved Context Relevance",
        "Answer Grounded in Retrieved Context",
        "Completeness of Response",
        "Hallucination Risk",
        "Overall Assessment",
    ],
)

display(grounding_eval)

# Side-by-side table of every system's answer per query. The answer columns
# are joined by position (raw .values), so all four source frames are assumed
# to hold one row per query in the same order as `queries`.
evaluation_df = pd.DataFrame(
    {
        "query_id": range(1, len(queries) + 1),
        "query": queries,
        **{
            column: frame["answer"].values
            for column, frame in (
                ("base_llm_answer", base_llm_df),
                ("prompt_engineered_answer", prompt_df),
                ("base_rag_answer", rag_df),
                ("tuned_rag_answer", tuned_rag_df),
            )
        },
    }
)

evaluation_df

# Write the comparison to the current working directory without the index.
evaluation_df.to_csv(
    path_or_buf="evaluation_comparison_outputs.csv", index=False
)
print("Evaluation comparison table saved.")

Evaluation comparison table saved.

# Comparative scoring table for the four system approaches
#
# One row per system; the five criterion scores are hand-assigned literals.

system_scores = pd.DataFrame(
    [
        ("Base LLM", 1, 3, 3, 1, 2),
        ("Prompt-Engineered LLM", 1, 4, 4, 1, 3),
        ("Base RAG", 4, 4, 4, 4, 4),
        ("Tuned RAG", 5, 5, 5, 5, 5),
    ],
    columns=[
        "System",
        "Document Grounding",
        "Answer Relevance",
        "Clarity",
        "Traceability",
        "Business Usefulness",
    ],
)

# Row-wise mean over the five criterion columns (label slice is inclusive).
system_scores["Average Score"] = system_scores.loc[
    :, "Document Grounding":"Business Usefulness"
].mean(axis=1)

system_scores

# Manually assigned scores per method, one row each.
# NOTE(review): grounding_in_documents is 2 for the two non-RAG methods here,
# while the `system_scores` table in this file gives the same systems 1 for
# "Document Grounding" - confirm which value is intended.
manual_scores = pd.DataFrame(
    [
        ("Base LLM", 2, 3, 3, 1, 2),
        ("Prompt Engineering", 2, 4, 4, 1, 3),
        ("Base RAG", 4, 4, 4, 4, 4),
        ("Tuned RAG", 5, 5, 5, 5, 5),
    ],
    columns=[
        "method",
        "grounding_in_documents",
        "answer_relevance",
        "clarity_and_structure",
        "traceability",
        "business_usefulness",
    ],
)

# Row-wise total over the five criterion columns (label slice is inclusive).
manual_scores["total_score"] = manual_scores.loc[
    :, "grounding_in_documents":"business_usefulness"
].sum(axis=1)

manual_scores

# Ranked view, highest total first; the result is displayed, not stored.
manual_scores.sort_values(by="total_score", ascending=False)

# Hand-written verdict per query: which method was judged best and why.
# The five rows are literals, so `queries` must hold exactly five entries for
# the query_id column to line up.
per_query_evaluation = pd.DataFrame(
    [
        (
            "Tuned RAG",
            "Most grounded and concise for the electricity demand question.",
        ),
        (
            "Tuned RAG",
            "Most aligned with retrieved LNG market context.",
        ),
        (
            "Base RAG",
            "Base RAG already captured the main coal and policy relationships well.",
        ),
        (
            "Tuned RAG",
            "Tuned RAG gave the clearest business-focused solar market interpretation.",
        ),
        (
            "Tuned RAG",
            "Tuned RAG was most stable and relevant for the SAF and refinery question.",
        ),
    ],
    columns=["best_method", "reason"],
)
# 1-based query ids go in as the first column.
per_query_evaluation.insert(0, "query_id", range(1, len(queries) + 1))

per_query_evaluation

# Method names with their total manual score, ranked best-first.
final_summary = manual_scores[["method", "total_score"]].sort_values(
    by="total_score", ascending=False
)

final_summary

# Closing recommendation per approach; every cell is a hand-written literal.
final_recommendation_table = pd.DataFrame(
    [
        (
            "Base LLM",
            "Weakest option; ungrounded and less reliable",
            "No",
        ),
        (
            "Prompt Engineering",
            "Improved structure but still ungrounded",
            "Limited",
        ),
        (
            "Base RAG",
            "Strong improvement through document grounding",
            "Yes",
        ),
        (
            "Tuned RAG",
            "Best overall balance of grounding, clarity, and consistency",
            "Yes - Preferred",
        ),
    ],
    columns=["approach", "overall_result", "recommended_for_business_use"],
)

final_recommendation_table

# Final sanity check: report (rows, columns) for each result table, one per
# line, in the same order the tables were produced.
print(
    "\n".join(
        f"{label} {frame.shape}"
        for label, frame in (
            ("Base LLM outputs shape:", base_llm_df),
            ("Prompt-engineered outputs shape:", prompt_df),
            ("RAG outputs shape:", rag_df),
            ("Tuned RAG outputs shape:", tuned_rag_df),
            ("Evaluation table shape:", evaluation_df),
        )
    )
)

Base LLM outputs shape: (5, 3)
Prompt-engineered outputs shape: (5, 3)
RAG outputs shape: (5, 4)
Tuned RAG outputs shape: (5, 4)
Evaluation table shape: (5, 6)

	query_id	query	answer
0	1	How is the rapid global expansion of artificia...	iii)
1	2	How is the unprecedented wave of new US liquef...	unanswerable
2	3	How are the surge in US electricity demand and...	Increasing in demand for electricity and the f...
3	4	How are the increasing frequency of negative w...	A lack of resources and adequate resources to ...
4	5	How do the tax credit modifications under the ...	a reduction in fuel efficiency

	query_id	query	answer
0	1	How is the rapid global expansion of artificia...	a global expansion of artificial intelligence ...
1	2	How is the unprecedented wave of new US liquef...	$90 billion
2	3	How are the surge in US electricity demand and...	energy markets, infrastructure, policy, and in...
3	4	How are the increasing frequency of negative w...	Economists are comparing energy markets to ene...
4	5	How do the tax credit modifications under the ...	impact on capacity rationalisation of traditio...

	query_id	query	answer
0	1	How is the rapid global expansion of artificia...	iii)
1	2	How is the unprecedented wave of new US liquef...	unanswerable
2	3	How are the surge in US electricity demand and...	Increasing in demand for electricity and the f...
3	4	How are the increasing frequency of negative w...	A lack of resources and adequate resources to ...
4	5	How do the tax credit modifications under the ...	a reduction in fuel efficiency

	query_id	query	base_answer	prompt_engineered_answer
0	1	How is the rapid global expansion of artificia...	iii)	a global expansion of artificial intelligence ...
1	2	How is the unprecedented wave of new US liquef...	unanswerable	$90 billion
2	3	How are the surge in US electricity demand and...	Increasing in demand for electricity and the f...	energy markets, infrastructure, policy, and in...
3	4	How are the increasing frequency of negative w...	A lack of resources and adequate resources to ...	Economists are comparing energy markets to ene...
4	5	How do the tax credit modifications under the ...	a reduction in fuel efficiency	impact on capacity rationalisation of traditio...

	query_id	query	answer	context
0	1	How is the rapid global expansion of artificia...	India, has a net electricity demand of about 4...	[Source 1] File: C:\Users\13015\Desktop\Final_...
1	2	How is the unprecedented wave of new US liquef...	a reorganization of the US-Mexico-Canada Agree...	[Source 1] File: C:\Users\13015\Desktop\Final_...
2	3	How are the surge in US electricity demand and...	a shift towards integrating biomass and hydrog...	[Source 1] File: C:\Users\13015\Desktop\Final_...
3	4	How are the increasing frequency of negative w...	0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0....	[Source 1] File: C:\Users\13015\Desktop\Final_...
4	5	How do the tax credit modifications under the ...	Using domestic versus imported feedstocks for ...	[Source 1] File: C:\Users\13015\Desktop\Final_...

Business Problem¶

Project Objective¶

Why This Problem Matters¶

Dataset Overview¶

Observations:¶

Insights¶

Note¶

2. Data Preparation for RAG¶

Insights¶

Insights¶

3. Vector Database Setup for RAG¶

Insights¶

4. Question Answering using Base LLM¶

Observations¶

5. Question Answering using LLM with Prompt Engineering¶

Observations¶

Side-by-side Comparison:¶

6. Question Answering using RAG¶

Observations¶

Debug Comparison:¶

7. Question Answering using Tuned RAG¶

Observations¶

Tuned Parameters Explanation¶

7.5 RAG Tuning Validation¶

Retrieved Context Validation¶

Grounding and Retrieval Evaluation¶

8. Output Evaluation¶

Comparative Evaluation Findings¶

Evaluation Insights¶

Evaluation Criteria Used¶

Final Evaluation Summary¶

9. Business Insights and Recommendations¶

Business Insights¶

Recommendations¶

Final Conclusion¶

Final Summary Table:¶

Final Export Check:¶

Key Takeaway¶

	query_id	query	answer	context
0	1	How is the rapid global expansion of artificia...	The rapid global expansion of artificial intel...	[Source 1] File: C:\Users\13015\Desktop\Final_...
1	2	How is the unprecedented wave of new US liquef...	a wave of new US liquefied natural gas (LNG) e...	[Source 1] File: C:\Users\13015\Desktop\Final_...
2	3	How are the surge in US electricity demand and...	The surge in US electricity demand and the 202...	[Source 1] File: C:\Users\13015\Desktop\Final_...
3	4	How are the increasing frequency of negative w...	Increasing frequency of negative wholesale ele...	[Source 1] File: C:\Users\13015\Desktop\Final_...
4	5	How do the tax credit modifications under the ...	The 'One Big Beautiful Bill Act' (OBBBA) affec...	[Source 1] File: C:\Users\13015\Desktop\Final_...

	Configuration	Advantages	Disadvantages	Observed Performance	Final Assessment
0	Top-k = 2	Highly focused retrieval with minimal irreleva...	Sometimes missed supporting details	Good precision but incomplete answers	Not selected
1	Top-k = 3	Balanced retrieval with strong relevance and s...	Occasional minor redundancy	Best overall balance of precision and complete...	Selected
2	Top-k = 4	More contextual coverage for complex questions	Increased irrelevant context	Moderate improvement in completeness but reduc...	Partially effective
3	Top-k = 5	Maximum information retrieval	Higher noise and reduced answer focus	Context overload and weaker response quality	Not recommended

	Query	Retrieved Context Relevance	Answer Grounded in Retrieved Context	Completeness of Response	Hallucination Risk	Overall Assessment
0	Causes of power outages	High	Yes	Comprehensive	Low	Strong retrieval and grounded response
1	Renewable energy and grid reliability	High	Yes	Comprehensive	Low	Well-supported and contextually accurate
2	Challenges affecting energy infrastructure	Moderate to High	Yes	Mostly Comprehensive	Low to Moderate	Relevant retrieval with minor contextual gaps

	System	Document Grounding	Answer Relevance	Clarity	Traceability	Business Usefulness	Average Score
0	Base LLM	1	3	3	1	2	2.0
1	Prompt-Engineered LLM	1	4	4	1	3	2.6
2	Base RAG	4	4	4	4	4	4.0
3	Tuned RAG	5	5	5	5	5	5.0

	method	grounding_in_documents	answer_relevance	clarity_and_structure	traceability	business_usefulness	total_score
0	Base LLM	2	3	3	1	2	11
1	Prompt Engineering	2	4	4	1	3	14
2	Base RAG	4	4	4	4	4	20
3	Tuned RAG	5	5	5	5	5	25

	query_id	best_method	reason
0	1	Tuned RAG	Most grounded and concise for the electricity ...
1	2	Tuned RAG	Most aligned with retrieved LNG market context.
2	3	Base RAG	Base RAG already captured the main coal and po...
3	4	Tuned RAG	Tuned RAG gave the clearest business-focused s...
4	5	Tuned RAG	Tuned RAG was most stable and relevant for the...

	approach	overall_result	recommended_for_business_use
0	Base LLM	Weakest option; ungrounded and less reliable	No
1	Prompt Engineering	Improved structure but still ungrounded	Limited
2	Base RAG	Strong improvement through document grounding	Yes
3	Tuned RAG	Best overall balance of grounding, clarity, an...	Yes - Preferred