Upload folder using huggingface_hub
Browse files
- README.md +13 -11
- __init__.py +0 -0
- example.py +49 -0
- hf_nemotron_parse_modeling.py +1 -1
- hf_nemotron_parse_processor.py +19 -3
- preprocessor_config.json +4 -0
README.md
CHANGED
|
@@ -34,7 +34,7 @@ Transformer-based vision-encoder-decoder model
|
|
| 34 |
|
| 35 |
### Network Architecture
|
| 36 |
* Vision Encoder: ViT-H model (https://huggingface.co/nvidia/C-RADIO)<br>
|
| 37 |
-
* Adapter Layer: 1D convolutions & norms to compress dimensionality and sequence length of the latent space (13184 tokens to
|
| 38 |
* Decoder: mBart [1] 10 blocks<br>
|
| 39 |
* Tokenizer: Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/)<br>
|
| 40 |
* Number of Parameters: < 1B<br>
|
|
@@ -51,7 +51,7 @@ Carbon Emissions: 3.21 tCO2e <br>
|
|
| 51 |
* Input Type(s): Red, Green, Blue (RGB) + Prompt (String)
|
| 52 |
* Input Parameters: 2D, 1D
|
| 53 |
- Other Properties Related to Input:
|
| 54 |
-
- Max Input Resolution (Width, Height):
|
| 55 |
- Min Input Resolution (Width, Height): 1024, 1280
|
| 56 |
- Channel Count: 3
|
| 57 |
|
|
@@ -78,6 +78,7 @@ The integration of foundation and fine-tuned models into AI systems requires add
|
|
| 78 |
## Model Version:
|
| 79 |
|
| 80 |
V1.1-Light
|
|
|
|
| 81 |
|
| 82 |
## Quick Start
|
| 83 |
|
|
@@ -95,15 +96,6 @@ from PIL import Image, ImageDraw
|
|
| 95 |
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
|
| 96 |
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 97 |
|
| 98 |
-
from hf_nemotron_parse_config import NemotronParseConfig
|
| 99 |
-
from hf_nemotron_parse_modeling import NemotronParseForConditionalGeneration
|
| 100 |
-
from hf_nemotron_parse_processor import NemotronParseProcessor, NemotronParseImageProcessor
|
| 101 |
-
|
| 102 |
-
AutoConfig.register("nemotron_parse", NemotronParseConfig)
|
| 103 |
-
AutoModel.register(NemotronParseConfig, NemotronParseForConditionalGeneration)
|
| 104 |
-
AutoProcessor.register("nemotron_parse", NemotronParseProcessor)
|
| 105 |
-
AutoImageProcessor.register("nemotron_parse", NemotronParseImageProcessor)
|
| 106 |
-
|
| 107 |
# Load model and processor
|
| 108 |
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light" # Or use a local path
|
| 109 |
device = "cuda:0"
|
|
@@ -131,6 +123,15 @@ outputs = model.generate(**inputs, generation_config=generation_config)
|
|
| 131 |
|
| 132 |
# Decode the generated text
|
| 133 |
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
classes, bboxes, texts = extract_classes_bboxes(generated_text)
|
| 135 |
bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
|
| 136 |
|
|
@@ -148,6 +149,7 @@ for bbox in bboxes:
|
|
| 148 |
draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
|
| 149 |
```
|
| 150 |
|
|
|
|
| 151 |
## Training, Testing, and Evaluation Datasets:
|
| 152 |
|
| 153 |
|
|
|
|
| 34 |
|
| 35 |
### Network Architecture
|
| 36 |
* Vision Encoder: ViT-H model (https://huggingface.co/nvidia/C-RADIO)<br>
|
| 37 |
+
* Adapter Layer: 1D convolutions & norms to compress dimensionality and sequence length of the latent space (13184 tokens to 833 tokens)<br>
|
| 38 |
* Decoder: mBart [1] 10 blocks<br>
|
| 39 |
* Tokenizer: Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/)<br>
|
| 40 |
* Number of Parameters: < 1B<br>
|
|
|
|
| 51 |
* Input Type(s): Red, Green, Blue (RGB) + Prompt (String)
|
| 52 |
* Input Parameters: 2D, 1D
|
| 53 |
- Other Properties Related to Input:
|
| 54 |
+
- Max Input Resolution (Width, Height): 1668, 2048
|
| 55 |
- Min Input Resolution (Width, Height): 1024, 1280
|
| 56 |
- Channel Count: 3
|
| 57 |
|
|
|
|
| 78 |
## Model Version:
|
| 79 |
|
| 80 |
V1.1-Light
|
| 81 |
+
This version preserves the reading order of Tables, Captions, Pictures, and other elements, and offers a 20% speed improvement compared to Nemotron-Parse-1.1.
|
| 82 |
|
| 83 |
## Quick Start
|
| 84 |
|
|
|
|
| 96 |
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
|
| 97 |
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
# Load model and processor
|
| 100 |
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light" # Or use a local path
|
| 101 |
device = "cuda:0"
|
|
|
|
| 123 |
|
| 124 |
# Decode the generated text
|
| 125 |
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
### Postprocessing
|
| 129 |
+
|
| 130 |
+
```python
|
| 131 |
+
import torch
|
| 132 |
+
from PIL import Image, ImageDraw
|
| 133 |
+
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
|
| 134 |
+
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 135 |
classes, bboxes, texts = extract_classes_bboxes(generated_text)
|
| 136 |
bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
|
| 137 |
|
|
|
|
| 149 |
draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
|
| 150 |
```
|
| 151 |
|
| 152 |
+
|
| 153 |
## Training, Testing, and Evaluation Datasets:
|
| 154 |
|
| 155 |
|
__init__.py
ADDED
|
File without changes
|
example.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from PIL import Image, ImageDraw
|
| 3 |
+
from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
|
| 4 |
+
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 5 |
+
|
| 6 |
+
# Load model and processor
|
| 7 |
+
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light" # Or use a local path
|
| 8 |
+
device = "cuda:0"
|
| 9 |
+
|
| 10 |
+
model = AutoModel.from_pretrained(
|
| 11 |
+
model_path,
|
| 12 |
+
trust_remote_code=True,
|
| 13 |
+
torch_dtype=torch.bfloat16
|
| 14 |
+
).to(device).eval()
|
| 15 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 16 |
+
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
|
| 17 |
+
|
| 18 |
+
# Load image
|
| 19 |
+
image = Image.open("path/to/your/image.jpg")
|
| 20 |
+
task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
|
| 21 |
+
|
| 22 |
+
# Process image
|
| 23 |
+
inputs = processor(images=[image], text=task_prompt, return_tensors="pt").to(device)
|
| 24 |
+
prompt_ids = processor.tokenizer.encode(task_prompt, return_tensors="pt", add_special_tokens=False).cuda()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
|
| 28 |
+
# Generate text
|
| 29 |
+
outputs = model.generate(**inputs, generation_config=generation_config)
|
| 30 |
+
|
| 31 |
+
# Decode the generated text
|
| 32 |
+
generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
|
| 33 |
+
classes, bboxes, texts = extract_classes_bboxes(generated_text)
|
| 34 |
+
bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
|
| 35 |
+
|
| 36 |
+
# Specify output formats for postprocessing
|
| 37 |
+
table_format = 'latex' # latex | HTML | markdown
|
| 38 |
+
text_format = 'markdown' # markdown | plain
|
| 39 |
+
blank_text_in_figures = False # remove text inside 'Picture' class
|
| 40 |
+
texts = [postprocess_text(text, cls = cls, table_format=table_format, text_format=text_format, blank_text_in_figures=blank_text_in_figures) for text, cls in zip(texts, classes)]
|
| 41 |
+
|
| 42 |
+
for cl, bb, txt in zip(classes, bboxes, texts):
|
| 43 |
+
print(cl, ': ', txt)
|
| 44 |
+
|
| 45 |
+
# OPTIONAL - Draw bounding boxes
|
| 46 |
+
draw = ImageDraw.Draw(image)
|
| 47 |
+
for bbox in bboxes:
|
| 48 |
+
draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
|
| 49 |
+
|
hf_nemotron_parse_modeling.py
CHANGED
|
@@ -13,7 +13,7 @@ from typing import Optional, List, Union, Tuple
|
|
| 13 |
import warnings
|
| 14 |
from transformers.modeling_outputs import BaseModelOutput
|
| 15 |
from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
|
| 16 |
-
from hf_nemotron_parse_config import NemotronParseLightConfig
|
| 17 |
from transformers import AutoModel
|
| 18 |
import time
|
| 19 |
from transformers.modeling_attn_mask_utils import (
|
|
|
|
| 13 |
import warnings
|
| 14 |
from transformers.modeling_outputs import BaseModelOutput
|
| 15 |
from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
|
| 16 |
+
from .hf_nemotron_parse_config import NemotronParseLightConfig
|
| 17 |
from transformers import AutoModel
|
| 18 |
import time
|
| 19 |
from transformers.modeling_attn_mask_utils import (
|
hf_nemotron_parse_processor.py
CHANGED
|
@@ -252,7 +252,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
|
|
| 252 |
class NemotronParseLightProcessor(ProcessorMixin):
|
| 253 |
|
| 254 |
attributes = ["image_processor", "tokenizer"]
|
| 255 |
-
image_processor_class = "
|
| 256 |
tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
|
| 257 |
|
| 258 |
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
|
@@ -350,8 +350,24 @@ class NemotronParseLightProcessor(ProcessorMixin):
|
|
| 350 |
|
| 351 |
This method is compatible with AutoProcessor.from_pretrained().
|
| 352 |
"""
|
| 353 |
-
#
|
| 354 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 355 |
|
| 356 |
def save_pretrained(self, save_directory, **kwargs):
|
| 357 |
"""
|
|
|
|
| 252 |
class NemotronParseLightProcessor(ProcessorMixin):
|
| 253 |
|
| 254 |
attributes = ["image_processor", "tokenizer"]
|
| 255 |
+
image_processor_class = "AutoImageProcessor"
|
| 256 |
tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
|
| 257 |
|
| 258 |
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
|
|
|
| 350 |
|
| 351 |
This method is compatible with AutoProcessor.from_pretrained().
|
| 352 |
"""
|
| 353 |
+
# Explicitly load subcomponents via Auto* to ensure remote auto_map is honored.
|
| 354 |
+
from transformers import AutoImageProcessor, AutoTokenizer
|
| 355 |
+
trust_remote_code = kwargs.get("trust_remote_code", None)
|
| 356 |
+
revision = kwargs.get("revision", None)
|
| 357 |
+
token = kwargs.get("token", None)
|
| 358 |
+
image_processor = AutoImageProcessor.from_pretrained(
|
| 359 |
+
pretrained_model_name_or_path,
|
| 360 |
+
trust_remote_code=trust_remote_code,
|
| 361 |
+
revision=revision,
|
| 362 |
+
token=token,
|
| 363 |
+
)
|
| 364 |
+
tokenizer = AutoTokenizer.from_pretrained(
|
| 365 |
+
pretrained_model_name_or_path,
|
| 366 |
+
trust_remote_code=trust_remote_code,
|
| 367 |
+
revision=revision,
|
| 368 |
+
token=token,
|
| 369 |
+
)
|
| 370 |
+
return cls(image_processor=image_processor, tokenizer=tokenizer)
|
| 371 |
|
| 372 |
def save_pretrained(self, save_directory, **kwargs):
|
| 373 |
"""
|
preprocessor_config.json
CHANGED
|
@@ -2,6 +2,10 @@
|
|
| 2 |
"feature_extractor_type": "NemotronParseLightImageProcessor",
|
| 3 |
"image_processor_type": "NemotronParseLightImageProcessor",
|
| 4 |
"processor_class": "NemotronParseLightProcessor",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
"do_normalize": false,
|
| 6 |
"do_rescale": true,
|
| 7 |
"rescale_factor": 0.00392156862745098,
|
|
|
|
| 2 |
"feature_extractor_type": "NemotronParseLightImageProcessor",
|
| 3 |
"image_processor_type": "NemotronParseLightImageProcessor",
|
| 4 |
"processor_class": "NemotronParseLightProcessor",
|
| 5 |
+
"auto_map": {
|
| 6 |
+
"AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
|
| 7 |
+
"AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
|
| 8 |
+
},
|
| 9 |
"do_normalize": false,
|
| 10 |
"do_rescale": true,
|
| 11 |
"rescale_factor": 0.00392156862745098,
|