katerynaCh committed
Commit 0ff7a3c · verified · 1 Parent(s): ab7b5c5

Upload folder using huggingface_hub
README.md CHANGED
@@ -34,7 +34,7 @@ Transformer-based vision-encoder-decoder model
 
 ### Network Architecture
 * Vision Encoder: ViT-H model (https://huggingface.co/nvidia/C-RADIO)<br>
-* Adapter Layer: 1D convolutions & norms to compress dimensionality and sequence length of the latent space (13184 tokens to 3201 tokens)<br>
+* Adapter Layer: 1D convolutions & norms to compress dimensionality and sequence length of the latent space (13184 tokens to 833 tokens)<br>
 * Decoder: mBart [1] 10 blocks<br>
 * Tokenizer: Use of the tokenizer included in this model is governed by the [CC-BY-4.0 license](https://creativecommons.org/licenses/by/4.0/)<br>
 * Number of Parameters: < 1B<br>
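The adapter itself is not spelled out in the card; as a rough illustration of the idea (1D convolutions plus norms that shrink both hidden size and token count), here is a minimal PyTorch sketch. All dimensions, kernel sizes, and strides are assumptions for illustration, not the shipped architecture:

```python
import torch
import torch.nn as nn

class ConvAdapter(nn.Module):
    """Hypothetical sequence-compressing adapter. Two stride-4 Conv1d stages
    reduce the token count by ~16x (13184 -> 824, in the ballpark of the 833
    quoted above); channel sizes are illustrative assumptions."""
    def __init__(self, in_dim: int = 1280, out_dim: int = 1024):
        super().__init__()
        self.conv1 = nn.Conv1d(in_dim, out_dim, kernel_size=4, stride=4)
        self.conv2 = nn.Conv1d(out_dim, out_dim, kernel_size=4, stride=4)
        self.act = nn.GELU()
        self.norm = nn.LayerNorm(out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, tokens, in_dim) -> (batch, ~tokens/16, out_dim)
        x = x.transpose(1, 2)  # Conv1d expects (batch, channels, length)
        x = self.act(self.conv1(x))
        x = self.conv2(x).transpose(1, 2)
        return self.norm(x)
```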
@@ -51,7 +51,7 @@ Carbon Emissions: 3.21 tCO2e <br>
 * Input Type(s): Red, Green, Blue (RGB) + Prompt (String)
 * Input Parameters: 2D, 1D
 - Other Properties Related to Input:
-    - Max Input Resolution (Width, Height): 1648, 2048
+    - Max Input Resolution (Width, Height): 1668, 2048
     - Min Input Resolution (Width, Height): 1024, 1280
     - Channel Count: 3
 
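If you want to keep pages within the corrected maximum yourself before handing them to the processor, a small helper can do it; this is a hypothetical convenience function, not part of the repo:

```python
from PIL import Image

MAX_W, MAX_H = 1668, 2048  # max input resolution (width, height) from the table

def fit_to_max(image: Image.Image) -> Image.Image:
    """Downscale an image to fit within the model's maximum input resolution,
    preserving aspect ratio. Illustrative helper, not part of this repo."""
    scale = min(MAX_W / image.width, MAX_H / image.height, 1.0)
    if scale < 1.0:
        image = image.resize(
            (int(image.width * scale), int(image.height * scale)),
            Image.LANCZOS,
        )
    return image
```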
@@ -78,6 +78,7 @@ The integration of foundation and fine-tuned models into AI systems requires add
 ## Model Version:
 
 V1.1-Light
+This version preserves the reading order of Tables, Captions, Pictures, and other elements, and offers a 20% speed improvement over Nemotron-Parse-1.1.
 
 ## Quick Start
 
@@ -95,15 +96,6 @@ from PIL import Image, ImageDraw
 from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
 from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
 
-from hf_nemotron_parse_config import NemotronParseConfig
-from hf_nemotron_parse_modeling import NemotronParseForConditionalGeneration
-from hf_nemotron_parse_processor import NemotronParseProcessor, NemotronParseImageProcessor
-
-AutoConfig.register("nemotron_parse", NemotronParseConfig)
-AutoModel.register(NemotronParseConfig, NemotronParseForConditionalGeneration)
-AutoProcessor.register("nemotron_parse", NemotronParseProcessor)
-AutoImageProcessor.register("nemotron_parse", NemotronParseImageProcessor)
-
 # Load model and processor
 model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light"  # Or use a local path
 device = "cuda:0"
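The deleted registration boilerplate is exactly what the `auto_map` entries added to the repo's config files (see the `preprocessor_config.json` change below) make unnecessary: with `trust_remote_code=True`, transformers resolves the custom classes on its own. The minimal load then looks like:

```python
from transformers import AutoModel, AutoProcessor

# auto_map in the repo's configs points transformers at the custom classes,
# so no AutoConfig/AutoModel/AutoProcessor.register(...) calls are needed.
model = AutoModel.from_pretrained(
    "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light", trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(
    "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light", trust_remote_code=True
)
```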
@@ -131,6 +123,15 @@ outputs = model.generate(**inputs, generation_config=generation_config)
 
 # Decode the generated text
 generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+```
+
+### Postprocessing
+
+```python
+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
+from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
 classes, bboxes, texts = extract_classes_bboxes(generated_text)
 bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
 
@@ -148,6 +149,7 @@ for bbox in bboxes:
     draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
 ```
 
+
 ## Training, Testing, and Evaluation Datasets:
 
__init__.py ADDED
File without changes
example.py ADDED
@@ -0,0 +1,49 @@
+import torch
+from PIL import Image, ImageDraw
+from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, AutoImageProcessor, GenerationConfig
+from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
+
+# Load model and processor
+model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light"  # Or use a local path
+device = "cuda:0"
+
+model = AutoModel.from_pretrained(
+    model_path,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16
+).to(device).eval()
+tokenizer = AutoTokenizer.from_pretrained(model_path)
+processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
+
+# Load image
+image = Image.open("path/to/your/image.jpg")
+task_prompt = "</s><s><predict_bbox><predict_classes><output_markdown>"
+
+# Process image
+inputs = processor(images=[image], text=task_prompt, return_tensors="pt").to(device)
+prompt_ids = processor.tokenizer.encode(task_prompt, return_tensors="pt", add_special_tokens=False).cuda()
+
+
+generation_config = GenerationConfig.from_pretrained(model_path, trust_remote_code=True)
+# Generate text
+outputs = model.generate(**inputs, generation_config=generation_config)
+
+# Decode the generated text
+generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0]
+classes, bboxes, texts = extract_classes_bboxes(generated_text)
+bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]
+
+# Specify output formats for postprocessing
+table_format = 'latex'  # latex | HTML | markdown
+text_format = 'markdown'  # markdown | plain
+blank_text_in_figures = False  # remove text inside 'Picture' class
+texts = [postprocess_text(text, cls=cls, table_format=table_format, text_format=text_format, blank_text_in_figures=blank_text_in_figures) for text, cls in zip(texts, classes)]
+
+for cl, bb, txt in zip(classes, bboxes, texts):
+    print(cl, ': ', txt)
+
+# OPTIONAL - Draw bounding boxes
+draw = ImageDraw.Draw(image)
+for bbox in bboxes:
+    draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
+
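The script ends with boxes drawn on `image` but never writes anything to disk; a short, hypothetical follow-up for persisting the results (file names are illustrative):

```python
# Hypothetical continuation of example.py: save the annotated page and dump
# the extracted text per detected element.
image.save("annotated_page.png")
with open("parsed_output.md", "w", encoding="utf-8") as f:
    for cl, txt in zip(classes, texts):
        f.write(f"<!-- {cl} -->\n{txt}\n\n")
```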
hf_nemotron_parse_modeling.py CHANGED
@@ -13,7 +13,7 @@ from typing import Optional, List, Union, Tuple
 import warnings
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
-from hf_nemotron_parse_config import NemotronParseLightConfig
+from .hf_nemotron_parse_config import NemotronParseLightConfig
 from transformers import AutoModel
 import time
 from transformers.modeling_attn_mask_utils import (
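The one-character change is load-bearing: transformers materializes remote code under a dynamic `transformers_modules` package, so sibling files in the repo must be imported relatively. A smoke test of the fixed path (the failure mode described in the comment is the typical one, not quoted from a log):

```python
from transformers import AutoModel

# With the package-relative import in place, the config module resolves from
# inside the dynamic transformers_modules package. The old absolute import
# would typically fail here with ModuleNotFoundError, since
# hf_nemotron_parse_config is not a top-level module on sys.path.
model = AutoModel.from_pretrained(
    "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light", trust_remote_code=True
)
```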
hf_nemotron_parse_processor.py CHANGED
@@ -252,7 +252,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
 class NemotronParseLightProcessor(ProcessorMixin):
 
     attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "NemotronParseLightImageProcessor"
+    image_processor_class = "AutoImageProcessor"
     tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
 
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
@@ -350,8 +350,24 @@ class NemotronParseLightProcessor(ProcessorMixin):
 
         This method is compatible with AutoProcessor.from_pretrained().
         """
-        # Use the parent class's from_pretrained method which handles auto-loading
-        return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
+        # Explicitly load subcomponents via Auto* to ensure remote auto_map is honored.
+        from transformers import AutoImageProcessor, AutoTokenizer
+        trust_remote_code = kwargs.get("trust_remote_code", None)
+        revision = kwargs.get("revision", None)
+        token = kwargs.get("token", None)
+        image_processor = AutoImageProcessor.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            token=token,
+        )
+        tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            token=token,
+        )
+        return cls(image_processor=image_processor, tokenizer=tokenizer)
 
     def save_pretrained(self, save_directory, **kwargs):
         """
preprocessor_config.json CHANGED
@@ -2,6 +2,10 @@
   "feature_extractor_type": "NemotronParseLightImageProcessor",
   "image_processor_type": "NemotronParseLightImageProcessor",
   "processor_class": "NemotronParseLightProcessor",
+  "auto_map": {
+    "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
+    "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
+  },
   "do_normalize": false,
   "do_rescale": true,
   "rescale_factor": 0.00392156862745098,