In [432]:
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()
api_key = os.getenv("GEMINI_API_KEY")


In [433]:
from google import genai
from google.generativeai import configure, GenerativeModel
client = genai.Client(api_key=api_key)
llm = GenerativeModel(model_name="gemini-2.5-flash")


In [434]:
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=api_key,
    temperature=0.0
)


In [435]:
# Part 2: Document Loading
from langchain_classic.document_loaders import TextLoader
from langchain_classic.text_splitter import RecursiveCharacterTextSplitter
from langchain_classic.vectorstores import Chroma
from langchain_classic.embeddings import SentenceTransformerEmbeddings
import requests
import tempfile
import os
def load_microsoft_10k(data_path):
    """
    Load Microsoft 10-K report from SEC website
    """


    # Microsoft 10-K URL


    loader = TextLoader(data_path, encoding='utf-8')
    documents = loader.load()
    print("Loaded from local file")
    return documents
# Load the document
documents = load_microsoft_10k("data.txt")

Loaded from local file


In [436]:
print(documents)

[Document(metadata={'source': 'data.txt'}, page_content='UNITED STATES\nSECURITIES AND EXCHANGE COMMISSION\nWashington, D.C. 20549\nFORM 10-K\n☒\nANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the Fiscal Year Ended June 30, 2022\nOR\n☐\nTRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the Transition Period From                  to\nCommission File Number 001-37845\nMICROSOFT CORPORATION\nWashington\n91-1144442\n(STATE OF INCORPORATION)\n(I.R.S. ID)\nONE MICROSOFT WAY, REDMOND, Washington 98052-6399\n(425) 882-8080\nwww.microsoft.com/investor\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading Symbol\nName of exchange on which registered\nCommon stock, $0.00000625 par value per share\nMSFT\nNasdaq\n3.125% Notes due 2028\nMSFT\nNasdaq\n2.625% Notes due 2033\nMSFT\nNasdaq\nSecurities registered pursuant to Section 12(g) of the Act:\nNone\nIndicate by check mark if

In [437]:
def Chunking(CHUNK_SIZE,CHUNK_OVERLAP,documents):

    # Part 2: Document Chunking
    print("Chunking Document")

    # Chunking Configuration


    print(f"Chunking Parameters:")
    print(f" Chunk Size: {CHUNK_SIZE} characters")
    print(f"Chunk Overlap: {CHUNK_OVERLAP} characters")
    print(f"Rationale for these parameters:")
    print(f"1000 chars: Good balance - enough context, not too long for embeddings")
    print(f"200 overlap: Prevents loss of context at chunk boundaries")
    print(f" Suitable for financial documents with dense information")

    # Initialize text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    # Split the document
    chunks = text_splitter.split_documents(documents)

    print(f"Document chunked successfully!")
    print(f"Created {len(chunks)} chunks")
    print(f"Sample chunk preview:")
    print(f"Length: {len(chunks[0].page_content)} characters")
    print(f"Content: {chunks[0].page_content[:200]}...")

    return chunks

In [438]:
CHUNK_SIZE = 1000      # Number of characters per chunk
CHUNK_OVERLAP = 200   # Overlap between chunks
chunks =Chunking(CHUNK_SIZE,CHUNK_OVERLAP,documents)

Chunking Document
Chunking Parameters:
 Chunk Size: 1000 characters
Chunk Overlap: 200 characters
Rationale for these parameters:
1000 chars: Good balance - enough context, not too long for embeddings
200 overlap: Prevents loss of context at chunk boundaries
 Suitable for financial documents with dense information
Document chunked successfully!
Created 479 chunks
Sample chunk preview:
Length: 882 characters
Content: UNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
☒
ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934
For the Fiscal Year Ended June ...


In [439]:
# Install dependencies (if not installed)
# !pip install chromadb sentence-transformers google-generativeai

import chromadb
from sentence_transformers import SentenceTransformer
import google.generativeai as genai

# --- Step 1: Initialize Chroma Client ---
client = chromadb.Client()

# Create (or get) a collection
collection = client.get_or_create_collection(name="chunked_data")
print("✅ Chroma collection ready!")

# --- Step 2: Initialize Embedding Model ---


def embed_chunk(text):
    """Convert text chunk into embedding vector"""
    return embedder.encode(text).tolist()







✅ Chroma collection ready!


In [440]:
# If your chunks are Document objects, do this:
texts = [chunk.page_content for chunk in chunks]

# Optional: metadata (if exists)
metadatas = [chunk.metadata for chunk in chunks]

In [441]:

for i, text in enumerate(texts):
    collection.add(
        documents=[text],
        metadatas=[metadatas[i]],
        ids=[str(i)],
        embeddings=[embed_chunk(text)]
    )
print("✅ Data added to Chroma!")

✅ Data added to Chroma!


In [442]:
# ----------------------------------------------------------
# STEP 4: BUILD BASELINE RETRIEVAL-QA CHAIN (fixed version)
# ----------------------------------------------------------
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.embeddings.base import Embeddings
from langchain_community.embeddings import SentenceTransformerEmbeddings
# Wrap your local SentenceTransformer in a class LangChain can use
class LocalEmbeddingFunction(Embeddings):
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        """Used for indexing documents."""
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Used for embedding search queries."""
        return self.model.encode([text])[0].tolist()


embeddings = SentenceTransformerEmbeddings(
    model_name="./all-MiniLM-L6-v2"
)
# Create embedding wrapper
embedding= LocalEmbeddingFunction(embeddings)

# Connect to Chroma collection (already created earlier)
vectordb = Chroma(
    client=client,
    collection_name="chunked_data",  # replace with your actual collection name
    embedding_function=embedding
)

# Create retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 3})




# Build RetrievalQA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True
)

print("✅ RetrievalQA chain created successfully!")


✅ RetrievalQA chain created successfully!


In [443]:
query = """
 extract the company's total revenue for the financial year ended June 30, 2022.

"""
query_embedding = embeddings.embed_query(query)
results= collection.query(query_embeddings=[query_embedding])


In [444]:
# Assume results is what you got from Chroma query
# Example: results['documents'] -> [['text1', 'text2', ...]]
def clean(results):
    cleaned_docs = []

    for doc_list in results['documents']:
        for doc in doc_list:
            # Remove newlines and extra spaces
            cleaned_text = " ".join(doc.split())
            cleaned_docs.append(cleaned_text)

    # Now cleaned_docs is a list of cleaned strings
    for i, doc in enumerate(cleaned_docs):
        print(f"Document {i+1}:\n{doc}\n")


In [445]:
clean(results)

Document 1:
Shares Amount Year Ended June 30, 2022 2021 2020 First Quarter 21 $ 6,200 25 $ 5,270 29 $ 4,000 Second Quarter 20 6,233 27 5,750 32 4,600 Third Quarter 26 7,800 25 5,750 37 6,000 Fourth Quarter 28 7,800 24 6,200 28 5,088 Total 95 $ 28,033 101 $ 22,970 126 $ 19,688

Document 2:
Shares Amount Year Ended June 30, 2022 2021 2020 First Quarter 21 $ 6,200 25 $ 5,270 29 $ 4,000 Second Quarter 20 6,233 27 5,750 32 4,600 Third Quarter 26 7,800 25 5,750 37 6,000 Fourth Quarter 28 7,800 24 6,200 28 5,088 Total 95 $ 28,033 101 $ 22,970 126 $ 19,688

Document 3:
NOTE 13 — UNEARNED REVENUE Unearned revenue by segment was as follows: (In millions) June 30, 2022 2021 Productivity and Business Processes $ 24,558 $ 22,120 Intelligent Cloud 19,371 17,710 More Personal Computing 4,479 4,311 Total $ 48,408 $ 44,141 Changes in unearned revenue were as follows: (In millions) Year Ended June 30, 2022 Balance, beginning of period $ 44,141 Deferral of revenue 110,455 Recognition of unearned revenue 

In [446]:

querys = ["What is the name of the company's independent registered public accounting firm?","Who are the key executive officers of the company as listed in the report?","Summarize the key risks related to competition mentioned in the report.","What does the report say about 'Azure'?","What does the report mention about sustainability or environmental responsibility?"
]


In [447]:
for i in querys :
    query_embedding = embeddings.embed_query(i)
    results= collection.query(query_embeddings=[query_embedding])
    print(f"Qustion :{i}")
    print(f" Answer :")
    clean(results)

Qustion :What is the name of the company's independent registered public accounting firm?
 Answer :
Document 1:
Item 9A REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUNTING FIRM To the Stockholders and the Board of Directors of Microsoft Corporation Opinion on Internal Control over Financial Reporting We have audited the internal control over financial reporting of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2022, based on criteria established in Internal Control — Integrated Framework (2013) issued by the Committee of Sponsoring Organizations of the Treadway Commission (COSO). In our opinion, the Company maintained, in all material respects, effective internal control over financial reporting as of June 30, 2022, based on criteria established in Internal Control — Integrated Framework (2013) issued by COSO.

Document 2:
Item 9A REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUNTING FIRM To the Stockholders and the Board of Directors of Microsoft Corporation Opinion

###### the answer to the given qustions are almost correct.Chunking helped to split the document in smaller ones which is managable and proved processing efficiency.I faced many issues becaus eof the package and other compactable versions atlast i tried with the manual encoders without directly implementing the advanced techniques

## ADVANCE RAG

In [448]:
from sentence_transformers import SentenceTransformer, util

# Use a semantic model for reranking
reranker_model = SentenceTransformer("all-MiniLM-L6-v2")

def rerank(query, docs, top_k=3):
    """
    Re-rank documents based on similarity to the query.
    Args:
        query (str): The question/query
        docs (list of str): List of document texts
        top_k (int): Number of top documents to keep
    Returns:
        List of top_k documents (strings)
    """
    # Compute embeddings
    query_emb = reranker_model.encode(query, convert_to_tensor=True)
    docs_emb = reranker_model.encode(docs, convert_to_tensor=True)

    # Compute cosine similarity
    scores = util.cos_sim(query_emb, docs_emb)[0]

    # Sort and take top_k
    top_indices = scores.topk(top_k).indices
    return [docs[i] for i in top_indices]


In [449]:
# Wrap the chain with reranking
def retrieval_qa_with_reranker(query):
    # Step 1: Retrieve documents

    query_embedding = embeddings.embed_query(query)
    results= collection.query(query_embeddings=[query_embedding])
    docs=clean(results)



    # Step 2: Extract text
    doc_texts = [doc.page_content for doc in docs]



    # Step 3: Rerank
    top_docs = rerank(query, doc_texts, top_k=3)

    # Step 4: Feed reranked docs to LLM
    answer = qa_chain.llm_chain.run({
        "input_documents": top_docs,
        "question": query
    })

    return answer


In [450]:
query = "What are the key risks related to competition mentioned in the report?"
answer = retrieval_qa_with_reranker(query)
print(answer)


Document 1:
30 PART I Item 1A Government regulatory actions and court decisions such as these may result in fines or hinder our ability to provide the benefits of our software to consumers and businesses, reducing the attractiveness of our products and the revenue that comes from them. New competition law actions could be initiated, potentially using previous actions as precedent. The outcome of such actions, or steps taken to avoid them, could adversely affect us in a variety of ways, including: • We may have to choose between withdrawing products from certain geographies to avoid fines or designing and developing alternative versions of those products to comply with government rulings, which may entail a delay in a product release and removing functionality that customers want or on which developers rely. •

Document 2:
30 PART I Item 1A Government regulatory actions and court decisions such as these may result in fines or hinder our ability to provide the benefits of our software to

TypeError: 'NoneType' object is not iterable

In [430]:
for i in querys:
    answer = retrieval_qa_with_reranker(i)
    print(answer)

Document 1:
Item 9A REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUNTING FIRM To the Stockholders and the Board of Directors of Microsoft Corporation Opinion on Internal Control over Financial Reporting We have audited the internal control over financial reporting of Microsoft Corporation and subsidiaries (the "Company") as of June 30, 2022, based on criteria established in Internal Control — Integrated Framework (2013) issued by the Committee of Sponsoring Organizations of the Treadway Commission (COSO). In our opinion, the Company maintained, in all material respects, effective internal control over financial reporting as of June 30, 2022, based on criteria established in Internal Control — Integrated Framework (2013) issued by COSO.

Document 2:
Item 9A REPORT OF INDEPENDENT REGISTERED PUBLIC ACCOUNTING FIRM To the Stockholders and the Board of Directors of Microsoft Corporation Opinion on Internal Control over Financial Reporting We have audited the internal control over financial re

TypeError: 'NoneType' object is not iterable

##### Adding a reranker almost always improves the results of a Retrieval-Augmented Generation (RAG) system, often significantly. While a basic RAG system uses an embedding model for an initial retrieval of documents based on semantic similarity, this can sometimes fail to surface the most relevant information.So the Advance Rag with the rerqking makes the answer to be more relevant that the Rag

#### As my first experiance in creatig rag on dense data it was hard for me to  ensuring the retriever actually returns the most relevant, meaningful chunks from dense documents.
#### If your chunk size is too small → you lose meaning.

#### If too large → retrieval becomes fuzzy and memory-heavy