app.py
CHANGED
@@ -3,15 +3,32 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
 import os
 import dotenv
+import random
+import numpy as np
+import torch

 dotenv.load_dotenv()

+def seed_everything(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)  # if using multi-GPU
+
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+seed_everything(22)
+
 def parse_schema_input(schema_input: str) -> Type[BaseModel]:
     """
     Convert user schema input to a Pydantic BaseModel.
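The seed_everything helper added above pins the RNG state of random, NumPy, and PyTorch (and disables cuDNN autotuning) so repeated runs of the pipeline score chunks the same way. A minimal sketch of the effect, using only torch.manual_seed; the values are illustrative:

```python
import torch

# With a fixed seed, successive runs draw identical values.
torch.manual_seed(22)
a = torch.rand(3)

torch.manual_seed(22)  # re-seed before drawing again
b = torch.rand(3)

assert torch.equal(a, b)  # same seed -> same tensors
```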
@@ -170,66 +187,19 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
 - Preserve the original formatting and context where relevant
 - Return the extracted data in the format specified by the schema"""

-    classification_prompt_template = """
-# HTML Chunk Relevance Classification Prompt
-
-You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
-
-## Instructions:
-1. Carefully examine the provided HTML chunk
-2. Compare it against the given schema/criteria
-3. Determine if the HTML chunk contains content that matches or is relevant to the schema
-4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
-
-## Input Format:
-**Schema/Criteria:**
-{schema}
-
-**HTML Chunk:**
-```html
-{content}
-```
-
-## Output Format:
-Your response must be ONLY a valid JSON object with no additional text:
-
-```json
-{{
-    "relevant": 1
-}}
-```
-
-OR
-
-```json
-{{
-    "relevant": 0
-}}
-```
-
-## Classification Rules:
-- Output 1 if the HTML chunk contains content that matches the schema criteria
-- Output 0 if the HTML chunk does not contain relevant content
-- Consider semantic meaning, not just exact keyword matches
-- Look at text content, attributes, structure, and context
-- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
-- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
-- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
-- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
-
-CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
-"""
+    classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
     preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
+        reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}

     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
+    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

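This hunk drops the hand-written relevance-classification prompt and instead passes the target schema's JSON Schema as the classifier prompt, which the NVIDIA reranker path later stringifies. A rough sketch of what Pydantic's model_json_schema() produces, using a hypothetical Product model that is not part of the app:

```python
from pydantic import BaseModel, Field

class Product(BaseModel):  # hypothetical example model
    name: str = Field(description="Product name")
    price: float = Field(description="Price in USD")

print(Product.model_json_schema())
# {'properties': {'name': {'description': 'Product name', 'title': 'Name', 'type': 'string'},
#                 'price': {'description': 'Price in USD', 'title': 'Price', 'type': 'number'}},
#  'required': ['name', 'price'], 'title': 'Product', 'type': 'object'}
```

Note that model_json_schema() returns a dict rather than a string, so the reranker receives the schema structure itself as its query.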
test.ipynb
DELETED
The diff for this file is too large to render. See raw diff.

web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/postprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ

web2json/ai_extractor.py
CHANGED
@@ -232,6 +232,7 @@ class NvidiaLLMClient(LLMClient):
         Returns:
             str: The generated text from the NVIDIA API.
         """
+        print("prompt: ", prompt)
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],

@@ -286,50 +287,38 @@ class NvidiaRerankerClient(RerankerClient):
         self.model_name = model_name

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
-    def rerank(self, query: str, passages: List[str], top_k: int = 3
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[Document]:
         # 1. Prepare and send documents for scoring
         docs = [Document(page_content=p) for p in passages]
-        # print("Bonjour")
-        # print(type(docs),docs)
-        # print(type(query),query)
         scored_docs = self.client.compress_documents(
             query=str(query),
             documents=docs
         )

-        # 2. Extract raw scores
+        # 2. Extract raw scores and compute sigmoid probabilities
         raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
+        print(f"raw scores {raw_scores}")
+        p_scores = 1 / (1 + np.exp(-raw_scores))
+        print(f"Sigmoid scores: {p_scores}")

-        # 3. Softmax normalization
-        exp_scores = np.exp(raw_scores - np.max(raw_scores))
-        softmax_scores = exp_scores / exp_scores.sum()
-
-        # 4. (Optional) Min–Max rescale of the softmax outputs
-        min_val, max_val = raw_scores.min(), raw_scores.max()
-        if max_val > min_val:
-            minmax_scores = (raw_scores - min_val) / (max_val - min_val)
-        else:
-            # all scores equal → set them all to 1
-            minmax_scores = np.ones_like(raw_scores)
-
-        # 5. Attach new scores back to metadata
-        for doc, s, mm in zip(scored_docs, softmax_scores, minmax_scores):
-            doc.metadata['softmax_score'] = float(s)
-            doc.metadata['minmax_score'] = float(mm)
-
-        # 6. Sort and return top_k by softmax_score
-        # Sort by softmax_score descending
-        sorted_docs = sorted(
-            scored_docs,
-            key=lambda d: d.metadata['softmax_score'],
-            reverse=True
-        )
-        # print("Ayeeeee")
-        # print("Docs Value:",sorted_docs)
-        # Filter by threshold
-        filtered_docs = [doc for doc in sorted_docs if doc.metadata['minmax_score'] >= threshold]
-        # print("Final", filtered_docs)
-        return filtered_docs
+        # 3. Min-max normalization
+        min_score = np.min(p_scores)
+        max_score = np.max(p_scores)
+        if max_score == min_score:
+            norm_scores = np.ones_like(p_scores)  # All values same — normalize to 1
+        else:
+            norm_scores = (p_scores - min_score) / (max_score - min_score)
+        print(f"Normalized scores: {norm_scores}")
+
+        # 4. Filter by threshold using normalized scores
+        scored_pairs = [(doc, norm) for doc, norm in zip(scored_docs, norm_scores) if norm > threshold]
+        print(f"Filtered pairs:\n{scored_pairs}")
+
+        # 5. Return top_k documents (already sorted by model, no need to re-sort)
+        top_docs = [doc.page_content for doc, _ in scored_pairs]
+        return top_docs


 # TODO: will I need it ?

@@ -353,32 +342,56 @@ class HFRerankerClient(LLMClient):
         self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")

-    def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
         """
-        Rerank passages based on relevance to query.
+        Rerank passages based on relevance to query using min-max normalized scores.

         Args:
             query (str): Query string.
             passages (List[str]): List of passages.
             top_k (int): Number of top passages to return.
+            threshold (float): Minimum normalized score to include passage.

         Returns:
-            List[str]: Top-k most relevant passages.
+            List[str]: Top-k most relevant passages above threshold.
         """
-        inputs = [
+        inputs = [
+            self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device)
+            for p in passages
+        ]
         scores = []

         with torch.no_grad():
             for inp in inputs:
                 logits = self.model(**inp).logits
+                # print("logits:", logits)
                 score = torch.softmax(logits, dim=1)[0, 1].item()  # probability of relevance
-        scores.append(score)
+                scores.append(score)

+        print(f"Softmax Scores: {scores}")
+
+        # Min-max normalize the scores
+        scores_np = np.array(scores)
+        min_score = scores_np.min()
+        max_score = scores_np.max()
+        if max_score == min_score:
+            norm_scores = np.ones_like(scores_np)
+        else:
+            norm_scores = (scores_np - min_score) / (max_score - min_score)

-        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
-        print(f"top indices: {top_indices}")
-        return [passages[i] for i in top_indices]
+        print(f"Normalized Scores: {norm_scores}")
+        # Filter based on normalized threshold
+        filtered = [(i, s) for i, s in enumerate(norm_scores) if s > threshold]
+        print(f"Filtered: {filtered}")
+
+        # Sort by normalized score descending
+        filtered.sort(key=lambda x: x[1], reverse=True)
+
+        # Select top_k passages
+        top_passages = [passages[i] for i, _ in filtered]
+
+        return top_passages

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
     def call_api(self, prompt: str) -> str:
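Both rerank() rewrites above share the same scoring recipe: turn raw relevance scores into probabilities (a sigmoid on the NVIDIA path, a softmax over the yes/no logits on the HF path), min-max normalize them, then keep only passages whose normalized score clears the threshold. A standalone sketch of that transform on made-up scores:

```python
import numpy as np

raw_scores = np.array([2.1, 0.3, -1.4])  # made-up reranker outputs
threshold = 0.5

# Sigmoid squashes raw scores into (0, 1)
p_scores = 1 / (1 + np.exp(-raw_scores))

# Min-max normalization; if every score is identical, keep them all (set to 1)
lo, hi = p_scores.min(), p_scores.max()
norm = np.ones_like(p_scores) if hi == lo else (p_scores - lo) / (hi - lo)

# Threshold filter mirrors the `norm > threshold` checks in both rerank() methods
keep = [i for i, s in enumerate(norm) if s > threshold]
print(norm, keep)  # ~[1.0, 0.54, 0.0] -> keep passages 0 and 1
```

One consequence of min-max normalization is that the best-scoring passage always maps to 1.0 and therefore always survives the filter, even when its raw score is low.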
@@ -457,6 +470,7 @@ class LLMClassifierExtractor(AIExtractor):
             # print("Using Hugging Face reranker for classification.")
             return self.reranker.rerank(query, passages, top_k=top_k)
         response = self.reranker.rerank(query,passages)
+        print(f"response: {response}")
         # print("DONNNNE")
         # NVIDIA reranker path
         return response

@@ -476,7 +490,8 @@ class LLMClassifierExtractor(AIExtractor):
         # print(f"Content successfully chunked: {chunks}")
         classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
         # extracting the content
-
+
+        # classified_chunks = [chunk.page_content for chunk in classified_chunks]
         # print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)
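For reference, a hypothetical call into the updated NVIDIA reranker, wired the way app.py now constructs it; the HTML snippets and the presence of NVIDIA_API_KEY are assumptions for illustration only:

```python
import os
from web2json.ai_extractor import NvidiaRerankerClient

reranker = NvidiaRerankerClient(config={
    'api_key': os.getenv('NVIDIA_API_KEY'),
    'model_name': 'nv-rerank-qa-mistral-4b:1',
})

chunks = [
    "<div class='product'><h1>Acme Mug</h1><span>$19.99</span></div>",  # made-up chunk
    "<nav>Home | About | Contact</nav>",                                # made-up chunk
]

# The query is whatever the extractor passes as the classifier prompt
# (after this change, the schema's JSON Schema); rerank() stringifies it internally.
kept = reranker.rerank(query="product name and price", passages=chunks,
                       top_k=3, threshold=0.5)
print(kept)  # page_content strings whose normalized score cleared the threshold
```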