Upload folder using huggingface_hub
Browse files- README.md +61 -11
- generation_config.json +3 -1
- tokenizer_config.json +2 -1
README.md
CHANGED
|
@@ -1,14 +1,3 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: other
|
| 3 |
-
license_name: nvidia-open-model-license
|
| 4 |
-
license_link: >-
|
| 5 |
-
https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license/
|
| 6 |
-
pipeline_tag: image-text-to-text
|
| 7 |
-
library_name: transformers
|
| 8 |
-
tags:
|
| 9 |
-
- nvidia
|
| 10 |
-
- VLM
|
| 11 |
-
---
|
| 12 |
# Nemotron-Parse-Lite Overview
|
| 13 |
|
| 14 |
nemotron-parse-lite is a general-purpose text-extraction model, specifically designed to handle documents. Given an image, nemotron-parse-lite is able to extract formatted text, with bounding boxes and the corresponding semantic class. This has downstream benefits for several tasks, such as increasing the availability of training data for Large Language Models (LLMs), improving the accuracy of retriever systems, and enhancing document-understanding pipelines.
|
|
@@ -160,6 +149,67 @@ for bbox in bboxes:
|
|
| 160 |
draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
|
| 161 |
```
|
| 162 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
|
| 164 |
## Training, Testing, and Evaluation Datasets:
|
| 165 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# Nemotron-Parse-Lite Overview
|
| 2 |
|
| 3 |
nemotron-parse-lite is a general-purpose text-extraction model, specifically designed to handle documents. Given an image, nemotron-parse-lite is able to extract formatted text, with bounding boxes and the corresponding semantic class. This has downstream benefits for several tasks, such as increasing the availability of training data for Large Language Models (LLMs), improving the accuracy of retriever systems, and enhancing document-understanding pipelines.
|
|
|
|
| 149 |
draw.rectangle((bbox[0], bbox[1], bbox[2], bbox[3]), outline="red")
|
| 150 |
```
|
| 151 |
|
| 152 |
+
## Inference with VLLM
|
| 153 |
+
|
| 154 |
+
### Install dependencies
|
| 155 |
+
|
| 156 |
+
```bash
|
| 157 |
+
uv venv --python 3.12 --seed
|
| 158 |
+
source .venv/bin/activate
|
| 159 |
+
uv pip install "git+https://github.com/amalad/vllm.git@nemotron_parse"
|
| 160 |
+
uv pip install timm albumentations
|
| 161 |
+
```
|
| 162 |
+
|
| 163 |
+
### Inference example
|
| 164 |
+
|
| 165 |
+
```python
|
| 166 |
+
from vllm import LLM, SamplingParams
|
| 167 |
+
from PIL import Image
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
sampling_params = SamplingParams(
|
| 171 |
+
temperature=0,
|
| 172 |
+
top_k=1,
|
| 173 |
+
repetition_penalty=1.1,
|
| 174 |
+
max_tokens=9000,
|
| 175 |
+
skip_special_tokens=False,
|
| 176 |
+
)
|
| 177 |
+
|
| 178 |
+
llm = LLM(
|
| 179 |
+
model="nvidia/NVIDIA-Nemotron-Parse-v1.1-Lite",
|
| 180 |
+
max_num_seqs=64,
|
| 181 |
+
limit_mm_per_prompt={"image": 1},
|
| 182 |
+
dtype="bfloat16",
|
| 183 |
+
trust_remote_code=True,
|
| 184 |
+
)
|
| 185 |
+
|
| 186 |
+
image = Image.open("<YOUR-IMAGE-PATH>")
|
| 187 |
+
|
| 188 |
+
prompts = [
|
| 189 |
+
{ # Implicit prompt
|
| 190 |
+
"prompt": "</s><s><predict_bbox><predict_classes><output_markdown>",
|
| 191 |
+
"multi_modal_data": {
|
| 192 |
+
"image": image
|
| 193 |
+
},
|
| 194 |
+
},
|
| 195 |
+
{ # Explicit encoder/decoder prompt
|
| 196 |
+
"encoder_prompt": {
|
| 197 |
+
"prompt": "",
|
| 198 |
+
"multi_modal_data": {
|
| 199 |
+
"image": image
|
| 200 |
+
},
|
| 201 |
+
},
|
| 202 |
+
"decoder_prompt": "</s><s><predict_bbox><predict_classes><output_markdown>",
|
| 203 |
+
},
|
| 204 |
+
]
|
| 205 |
+
|
| 206 |
+
outputs = llm.generate(prompts, sampling_params)
|
| 207 |
+
|
| 208 |
+
for output in outputs:
|
| 209 |
+
prompt = output.prompt
|
| 210 |
+
generated_text = output.outputs[0].text
|
| 211 |
+
print(f"Decoder prompt: {prompt!r}, Generated text: {generated_text!r}")
|
| 212 |
+
```
|
| 213 |
|
| 214 |
## Training, Testing, and Evaluation Datasets:
|
| 215 |
|
generation_config.json
CHANGED
|
@@ -9,5 +9,7 @@
|
|
| 9 |
"do_sample": false,
|
| 10 |
"num_beams": 1,
|
| 11 |
"repetition_penalty": 1.1,
|
| 12 |
-
"transformers_version": "4.51.3"
|
|
|
|
|
|
|
| 13 |
}
|
|
|
|
| 9 |
"do_sample": false,
|
| 10 |
"num_beams": 1,
|
| 11 |
"repetition_penalty": 1.1,
|
| 12 |
+
"transformers_version": "4.51.3",
|
| 13 |
+
"top_k": 1,
|
| 14 |
+
"temperature": 0
|
| 15 |
}
|
tokenizer_config.json
CHANGED
|
@@ -18820,5 +18820,6 @@
|
|
| 18820 |
"truncation_side": "right",
|
| 18821 |
"truncation_strategy": "longest_first",
|
| 18822 |
"unk_token": "<unk>",
|
| 18823 |
-
"vocab_file": null
|
|
|
|
| 18824 |
}
|
|
|
|
| 18820 |
"truncation_side": "right",
|
| 18821 |
"truncation_strategy": "longest_first",
|
| 18822 |
"unk_token": "<unk>",
|
| 18823 |
+
"vocab_file": null,
|
| 18824 |
+
"chat_template": "{%- for message in messages -%}{%- for part in message['content'] -%}{{ part['text'] if part['type'] == 'text' else '' }}{%- endfor -%}{%- endfor -%}"
|
| 18825 |
}
|