app.py
CHANGED
@@ -3,15 +3,32 @@ import pandas as pd
 import gradio as gr
 from typing import Dict, Any, Type
 from web2json.preprocessor import BasicPreprocessor
-from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient
+from web2json.ai_extractor import AIExtractor,LLMClassifierExtractor,NvidiaLLMClient, NvidiaRerankerClient
 from web2json.postprocessor import PostProcessor
 from web2json.pipeline import Pipeline
 from pydantic import BaseModel, Field, create_model
 import os
 import dotenv
+import random
+import numpy as np
+import torch

 dotenv.load_dotenv()

+def seed_everything(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+
+    if torch.cuda.is_available():
+        torch.cuda.manual_seed(seed)
+        torch.cuda.manual_seed_all(seed)  # if using multi-GPU
+
+    torch.backends.cudnn.deterministic = True
+    torch.backends.cudnn.benchmark = False
+
+seed_everything(22)
+
 def parse_schema_input(schema_input: str) -> Type[BaseModel]:
     """
     Convert user schema input to a Pydantic BaseModel.
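The seed_everything helper added above pins the RNG state of random, NumPy, and PyTorch (and disables cuDNN autotuning) so repeated runs of the pipeline score chunks the same way. A minimal sketch of the effect, using only torch.manual_seed; the values are illustrative:

```python
import torch

# With a fixed seed, successive runs draw identical values.
torch.manual_seed(22)
a = torch.rand(3)

torch.manual_seed(22)  # re-seed before drawing again
b = torch.rand(3)

assert torch.equal(a, b)  # same seed -> same tensors
```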
@@ -170,66 +187,19 @@ def webpage_to_json(content: str, is_url: bool, schema: BaseModel) -> Dict[str, Any]:
 - Preserve the original formatting and context where relevant
 - Return the extracted data in the format specified by the schema"""

-    classification_prompt_template = """
-# HTML Chunk Relevance Classification Prompt
-
-You are an HTML content classifier. Your task is to analyze an HTML chunk against a given schema and determine if the content is relevant.
-
-## Instructions:
-1. Carefully examine the provided HTML chunk
-2. Compare it against the given schema/criteria
-3. Determine if the HTML chunk contains content that matches or is relevant to the schema
-4. Respond with ONLY a JSON object containing a single field "relevant" with value 1 (relevant) or 0 (not relevant)
-
-## Input Format:
-**Schema/Criteria:**
-{schema}
-
-**HTML Chunk:**
-```html
-{content}
-```
-
-## Output Format:
-Your response must be ONLY a valid JSON object with no additional text:
-
-```json
-{{
-    "relevant": 1
-}}
-```
-
-OR
-
-```json
-{{
-    "relevant": 0
-}}
-```
-
-## Classification Rules:
-- Output 1 if the HTML chunk contains content that matches the schema criteria
-- Output 0 if the HTML chunk does not contain relevant content
-- Consider semantic meaning, not just exact keyword matches
-- Look at text content, attributes, structure, and context
-- Ignore purely structural HTML elements (like divs, spans) unless they contain relevant content
-- Be STRICT in your evaluation - only mark as relevant (1) if there is clear, meaningful content that directly relates to the schema
-- Empty elements, placeholder text, navigation menus, headers/footers, and generic UI components should typically be marked as not relevant (0)
-- The HTML chunk does not need to contain ALL schema information, but it must contain SUBSTANTIAL and SPECIFIC content related to the schema
-
-CRITICAL: Your entire response MUST be exactly one JSON object. DO NOT include any explanations, reasoning, markdown formatting, code blocks, or additional text. Output ONLY the raw JSON object.
-"""
+    classification_prompt_template = schema.model_json_schema()
     # Initialize pipeline components
     # TODO: improve the RAG system and optimize (don't instantiate every time)
     preprocessor = BasicPreprocessor(config={'keep_tags': True})
     try:
         # llm = GeminiLLMClient(config={'api_key': os.getenv('GEMINI_API_KEY')})
         llm = NvidiaLLMClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'qwen/qwen2.5-7b-instruct'})
+        reranker = NvidiaRerankerClient(config={'api_key': os.getenv('NVIDIA_API_KEY'),'model_name': 'nv-rerank-qa-mistral-4b:1'})
     except Exception as e:
         return {"error": f"Failed to initialize LLM client: {str(e)}"}

     # ai_extractor = RAGExtractor(llm_client=llm, prompt_template=prompt_template)
-    ai_extractor = LLMClassifierExtractor(llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
+    ai_extractor = LLMClassifierExtractor(reranker=reranker, llm_client=llm, prompt_template=prompt_template, classifier_prompt=classification_prompt_template)
     postprocessor = PostProcessor()
     pipeline = Pipeline(preprocessor, ai_extractor, postprocessor)

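This hunk drops the hand-written relevance-classification prompt and instead passes the target schema's JSON Schema as the classifier prompt, which the NVIDIA reranker path later stringifies. A rough sketch of what Pydantic's model_json_schema() produces, using a hypothetical Product model that is not part of the app:

```python
from pydantic import BaseModel, Field

class Product(BaseModel):  # hypothetical example model
    name: str = Field(description="Product name")
    price: float = Field(description="Price in USD")

print(Product.model_json_schema())
# {'properties': {'name': {'description': 'Product name', 'title': 'Name', 'type': 'string'},
#                 'price': {'description': 'Price in USD', 'title': 'Price', 'type': 'number'}},
#  'required': ['name', 'price'], 'title': 'Product', 'type': 'object'}
```

Note that model_json_schema() returns a dict rather than a string, so the reranker receives the schema structure itself as its query.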
test.ipynb
DELETED
The diff for this file is too large to render. See raw diff.

web2json/__pycache__/ai_extractor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/ai_extractor.cpython-311.pyc and b/web2json/__pycache__/ai_extractor.cpython-311.pyc differ

web2json/__pycache__/pipeline.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/pipeline.cpython-311.pyc and b/web2json/__pycache__/pipeline.cpython-311.pyc differ

web2json/__pycache__/postprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/postprocessor.cpython-311.pyc and b/web2json/__pycache__/postprocessor.cpython-311.pyc differ

web2json/__pycache__/preprocessor.cpython-311.pyc
CHANGED
Binary files a/web2json/__pycache__/preprocessor.cpython-311.pyc and b/web2json/__pycache__/preprocessor.cpython-311.pyc differ

web2json/ai_extractor.py
CHANGED
@@ -232,6 +232,7 @@ class NvidiaLLMClient(LLMClient):
         Returns:
             str: The generated text from the NVIDIA API.
         """
+        print("prompt: ", prompt)
         response = self.client.chat.completions.create(
             model=self.model_name,
             messages=[{"role": "user", "content": prompt}],

@@ -286,50 +287,38 @@ class NvidiaRerankerClient(RerankerClient):
         self.model_name = model_name

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
-    def rerank(self, query: str, passages: List[str], top_k: int = 3
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[Document]:
         # 1. Prepare and send documents for scoring
         docs = [Document(page_content=p) for p in passages]
-        # print("Bonjour")
-        # print(type(docs),docs)
-        # print(type(query),query)
         scored_docs = self.client.compress_documents(
             query=str(query),
             documents=docs
         )

-        # 2. Extract raw scores
+        # 2. Extract raw scores and compute sigmoid probabilities
         raw_scores = np.array([doc.metadata['relevance_score'] for doc in scored_docs], dtype=float)
+        print(f"raw scores {raw_scores}")
+        p_scores = 1 / (1 + np.exp(-raw_scores))
+        print(f"Sigmoid scores: {p_scores}")

-        # 3. Softmax normalization
-        exp_scores = np.exp(raw_scores - np.max(raw_scores))
-        softmax_scores = exp_scores / exp_scores.sum()
-
-        # 4. (Optional) Min–Max rescale of the softmax outputs
-        min_val, max_val = raw_scores.min(), raw_scores.max()
-        if max_val > min_val:
-            minmax_scores = (raw_scores - min_val) / (max_val - min_val)
-        else:
-            # all scores equal → set them all to 1
-            minmax_scores = np.ones_like(raw_scores)
-
-        # 5. Attach new scores back to metadata
-        for doc, s, mm in zip(scored_docs, softmax_scores, minmax_scores):
-            doc.metadata['softmax_score'] = float(s)
-            doc.metadata['minmax_score'] = float(mm)
-
-        # 6. Sort and return top_k by softmax_score
-        # Sort by softmax_score descending
-        sorted_docs = sorted(
-            scored_docs,
-            key=lambda d: d.metadata['softmax_score'],
-            reverse=True
-        )
-        # print("Ayeeeee")
-        # print("Docs Value:",sorted_docs)
-        # Filter by threshold
-        filtered_docs = [doc for doc in sorted_docs if doc.metadata['minmax_score'] >= threshold]
-        # print("Final", filtered_docs)
-        return filtered_docs
+        # 3. Min-max normalization
+        min_score = np.min(p_scores)
+        max_score = np.max(p_scores)
+        if max_score == min_score:
+            norm_scores = np.ones_like(p_scores)  # All values same — normalize to 1
+        else:
+            norm_scores = (p_scores - min_score) / (max_score - min_score)
+        print(f"Normalized scores: {norm_scores}")
+
+        # 4. Filter by threshold using normalized scores
+        scored_pairs = [(doc, norm) for doc, norm in zip(scored_docs, norm_scores) if norm > threshold]
+        print(f"Filtered pairs:\n{scored_pairs}")
+
+        # 5. Return top_k documents (already sorted by model, no need to re-sort)
+        top_docs = [doc.page_content for doc, _ in scored_pairs]
+        return top_docs


 # TODO: will I need it ?

@@ -353,32 +342,56 @@ class HFRerankerClient(LLMClient):
         self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes")
         self.token_false_id = self.tokenizer.convert_tokens_to_ids("no")

-    def rerank(self, query: str, passages: List[str], top_k: int = 3) -> List[str]:
+    def rerank(self, query: str, passages: List[str], top_k: int = 3, threshold: float = 0.5) -> List[str]:
         """
-        Rerank passages based on relevance to query.
+        Rerank passages based on relevance to query using min-max normalized scores.

         Args:
             query (str): Query string.
             passages (List[str]): List of passages.
             top_k (int): Number of top passages to return.
+            threshold (float): Minimum normalized score to include passage.

         Returns:
-            List[str]: Top-k most relevant passages.
+            List[str]: Top-k most relevant passages above threshold.
         """
-        inputs = [
+        inputs = [
+            self.tokenizer(f"{query} [SEP] {p}", return_tensors="pt", truncation=True, padding=True).to(self.device)
+            for p in passages
+        ]
         scores = []

         with torch.no_grad():
             for inp in inputs:
                 logits = self.model(**inp).logits
+                # print("logits:", logits)
                 score = torch.softmax(logits, dim=1)[0, 1].item()  # probability of relevance
-        scores.append(score)
+                scores.append(score)

+        print(f"Softmax Scores: {scores}")
+
+        # Min-max normalize the scores
+        scores_np = np.array(scores)
+        min_score = scores_np.min()
+        max_score = scores_np.max()
+        if max_score == min_score:
+            norm_scores = np.ones_like(scores_np)
+        else:
+            norm_scores = (scores_np - min_score) / (max_score - min_score)

-        top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
-        print(f"top indices: {top_indices}")
-        return [passages[i] for i in top_indices]
+        print(f"Normalized Scores: {norm_scores}")
+        # Filter based on normalized threshold
+        filtered = [(i, s) for i, s in enumerate(norm_scores) if s > threshold]
+        print(f"Filtered: {filtered}")
+
+        # Sort by normalized score descending
+        filtered.sort(key=lambda x: x[1], reverse=True)
+
+        # Select top_k passages
+        top_passages = [passages[i] for i, _ in filtered]
+
+        return top_passages

     @retry_on_ratelimit(max_retries=6, base_delay=0.5, max_delay=5.0)
     def call_api(self, prompt: str) -> str:
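Both rerank() rewrites above share the same scoring recipe: turn raw relevance scores into probabilities (a sigmoid on the NVIDIA path, a softmax over the yes/no logits on the HF path), min-max normalize them, then keep only passages whose normalized score clears the threshold. A standalone sketch of that transform on made-up scores:

```python
import numpy as np

raw_scores = np.array([2.1, 0.3, -1.4])  # made-up reranker outputs
threshold = 0.5

# Sigmoid squashes raw scores into (0, 1)
p_scores = 1 / (1 + np.exp(-raw_scores))

# Min-max normalization; if every score is identical, keep them all (set to 1)
lo, hi = p_scores.min(), p_scores.max()
norm = np.ones_like(p_scores) if hi == lo else (p_scores - lo) / (hi - lo)

# Threshold filter mirrors the `norm > threshold` checks in both rerank() methods
keep = [i for i, s in enumerate(norm) if s > threshold]
print(norm, keep)  # ~[1.0, 0.54, 0.0] -> keep passages 0 and 1
```

One consequence of min-max normalization is that the best-scoring passage always maps to 1.0 and therefore always survives the filter, even when its raw score is low.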
@@ -457,6 +470,7 @@ class LLMClassifierExtractor(AIExtractor):
             # print("Using Hugging Face reranker for classification.")
             return self.reranker.rerank(query, passages, top_k=top_k)
         response = self.reranker.rerank(query,passages)
+        print(f"response: {response}")
         # print("DONNNNE")
         # NVIDIA reranker path
         return response

@@ -476,7 +490,8 @@ class LLMClassifierExtractor(AIExtractor):
         # print(f"Content successfully chunked: {chunks}")
         classified_chunks = self.classify_chunks(chunks, hf=hf) # conditional reranker
         # extracting the content
-
+
+        # classified_chunks = [chunk.page_content for chunk in classified_chunks]
         # print(f"Classified Chunks {len(classified_chunks)}")
         # print(classified_chunks)
         # print('='*80)
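For reference, a hypothetical call into the updated NVIDIA reranker, wired the way app.py now constructs it; the HTML snippets and the presence of NVIDIA_API_KEY are assumptions for illustration only:

```python
import os
from web2json.ai_extractor import NvidiaRerankerClient

reranker = NvidiaRerankerClient(config={
    'api_key': os.getenv('NVIDIA_API_KEY'),
    'model_name': 'nv-rerank-qa-mistral-4b:1',
})

chunks = [
    "<div class='product'><h1>Acme Mug</h1><span>$19.99</span></div>",  # made-up chunk
    "<nav>Home | About | Contact</nav>",                                # made-up chunk
]

# The query is whatever the extractor passes as the classifier prompt
# (after this change, the schema's JSON Schema); rerank() stringifies it internally.
kept = reranker.rerank(query="product name and price", passages=chunks,
                       top_k=3, threshold=0.5)
print(kept)  # page_content strings whose normalized score cleared the threshold
```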