From ba77670f15a150b613c2a226de748d4f3092af77 Mon Sep 17 00:00:00 2001 From: Govindh Kishore Date: Wed, 11 Mar 2026 02:05:15 +0530 Subject: [PATCH] feat(retriever): add token-aware context truncation to HybridRetriever --- config_default.yml | 5 +++++ src/retrievers/csv_chroma.py | 5 +++-- src/util/context_truncator.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 2 deletions(-) create mode 100644 src/util/context_truncator.py diff --git a/config_default.yml b/config_default.yml index e53055a..d2e97f5 100644 --- a/config_default.yml +++ b/config_default.yml @@ -3,6 +3,11 @@ profiles: - React-to-Me +retriever: + context_truncation: + max_docs: 15 + max_tokens: 12000 + features: postprocessing: # external web search feature enabled: true diff --git a/src/retrievers/csv_chroma.py b/src/retrievers/csv_chroma.py index a792c93..3aec16c 100644 --- a/src/retrievers/csv_chroma.py +++ b/src/retrievers/csv_chroma.py @@ -17,6 +17,7 @@ from nltk.tokenize import word_tokenize from pydantic import AfterValidator, Field from pydantic.json_schema import SkipJsonSchema +from util.context_truncator import truncate_to_token_limit chroma_settings = chromadb.config.Settings(anonymized_telemetry=False) @@ -179,7 +180,7 @@ def retrieve_documents(self, queries: list[str], run_manager) -> list[Document]: ) doc_lists.append(bm25_docs + vector_docs) subdirectory_docs.extend(self.weighted_reciprocal_rank(doc_lists)) - return subdirectory_docs + return truncate_to_token_limit(subdirectory_docs) async def aretrieve_documents( self, queries: list[str], run_manager @@ -219,4 +220,4 @@ async def aretrieve_documents( for bm25_results, vector_results in zip(results_iter, results_iter) ] subdirectory_docs.extend(self.weighted_reciprocal_rank(doc_lists)) - return subdirectory_docs + return truncate_to_token_limit(subdirectory_docs) diff --git a/src/util/context_truncator.py b/src/util/context_truncator.py new file mode 100644 index 0000000..0876f49 --- /dev/null +++ 
from functools import lru_cache

from langchain_core.documents import Document
import tiktoken


@lru_cache(maxsize=8)
def _get_encoder(model: str):
    """Return a cached tiktoken encoder for *model*.

    ``encoding_for_model`` does a registry lookup and constructs a BPE
    encoder, which is far too expensive to repeat on every retrieval
    call; the cache makes repeated truncations effectively free.

    Unknown model names raise ``KeyError`` inside tiktoken; fall back to
    ``cl100k_base`` (the GPT-4-family encoding) so retrieval degrades to
    an approximate token count instead of crashing.
    """
    try:
        return tiktoken.encoding_for_model(model)
    except KeyError:
        return tiktoken.get_encoding("cl100k_base")


def truncate_to_token_limit(
    docs: list[Document],
    max_docs: int = 15,
    max_tokens: int = 12000,
    model: str = "gpt-4o",
) -> list[Document]:
    """Truncate a ranked document list to fit token and count budgets.

    ``docs`` must already be ordered best-to-worst (e.g. by weighted
    reciprocal rank); truncation only ever drops a suffix, so the least
    relevant documents are removed first and order is preserved.

    NOTE(review): config_default.yml defines
    ``retriever.context_truncation.{max_docs,max_tokens}`` but the call
    sites in csv_chroma.py pass no overrides, so those settings are
    currently dead — confirm the config is wired through, or the
    defaults here silently win.

    Args:
        docs: Ranked documents, best first.
        max_docs: Hard cap on the number of documents kept.
        max_tokens: Total token budget across the kept documents.
        model: Model name used to select the tokenizer.

    Returns:
        A prefix of ``docs`` that fits both budgets. Always contains at
        least one document when ``docs`` is non-empty, even if that
        single document alone exceeds ``max_tokens`` — the caller should
        never lose its best hit to one oversized chunk.
    """
    encoder = _get_encoder(model)
    result: list[Document] = []
    total_tokens = 0

    for doc in docs[:max_docs]:
        doc_tokens = len(encoder.encode(doc.page_content))

        if total_tokens + doc_tokens > max_tokens:
            # Keep at least one doc even over budget (see docstring).
            if not result:
                result.append(doc)
            break

        result.append(doc)
        total_tokens += doc_tokens

    return result