Quantifying the Carbon Emissions of Machine Learning
Paper
•
1910.09700
•
Published
•
23
This model serves as a demonstration of how fine-tuning foundational models using the Neo4j-Text2Cypher(2024) Dataset (link) can enhance performance on the Text2Cypher task.
Please note, this is part of ongoing research and exploration, aimed at highlighting the dataset's potential rather than a production-ready solution.
Base model: google/gemma-2-9b-it
Dataset: neo4j/text2cypher-2024v1
An overview of the fine-tuned models and benchmarking results is shared at Link1 and Link2
Have ideas or insights? Contact us: Neo4j/Team-GenAI
We need to be cautious about a few risks:
Also check the related blog post: Link
Used RunPod with the following setup:
from peft import PeftModel, PeftConfig
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
)
# Prompt template for the Text2Cypher task. The `{schema}` and `{question}`
# placeholders are filled in with `str.format` when the chat prompt is built.
# The spaces around the newlines are part of the template text.
instruction = (
    "Generate Cypher statement to query a graph database. Use only the "
    "provided relationship types and properties in the schema. \n"
    "Schema: {schema} \n"
    " Question: {question} \n"
    " Cypher output: "
)
def prepare_chat_prompt(question: str, schema: str) -> list[dict]:
    """Build the single-turn chat used to prompt the model.

    Args:
        question: Natural-language question to translate into Cypher.
        schema: Textual description of the graph schema the query may use.

    Returns:
        A one-element list holding a ``{"role": "user", "content": ...}``
        message whose content is the module-level ``instruction`` template
        filled in with ``schema`` and ``question``.
    """
    user_message = {
        "role": "user",
        "content": instruction.format(schema=schema, question=question),
    }
    return [user_message]
def _postprocess_output_cypher(output_cypher: str) -> str:
# Remove any explanation. E.g. MATCH...\n\n**Explanation:**\n\n -> MATCH...
# Remove cypher indicator. E.g.```cypher\nMATCH...```` --> MATCH...
# Note: Possible to have both:
# E.g. ```cypher\nMATCH...````\n\n**Explanation:**\n\n --> MATCH...
partition_by = "**Explanation:**"
output_cypher, _, _ = output_cypher.partition(partition_by)
output_cypher = output_cypher.strip("`\n")
output_cypher = output_cypher.lstrip("cypher\n")
output_cypher = output_cypher.strip("`\n ")
return output_cypher
# Model
model_name = "neo4j/text2cypher-gemma-2-9b-it-finetuned-2024v1"

# Quantization settings passed to the model loader: 4-bit NF4 weights with
# double quantization, computing in bfloat16.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    attn_implementation="eager",
)
# Question
question = "What are the movies of Tom Hanks?"
schema = "(:Actor)-[:ActedIn]->(:Movie)"  # Check the NOTE below on creating your own schemas

# Render the chat into the model's prompt format as plain text, then tokenize.
new_message = prepare_chat_prompt(question=question, schema=schema)
prompt = tokenizer.apply_chat_template(
    new_message, add_generation_prompt=True, tokenize=False
)
inputs = tokenizer(prompt, return_tensors="pt", padding=True)

# Any other parameters
model_generate_parameters = {
    "top_p": 0.9,
    "temperature": 0.2,
    "max_new_tokens": 512,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
}

# Move the encoded prompt to the model's device before generating.
inputs = inputs.to(model.device)
model.eval()
with torch.no_grad():
    tokens = model.generate(**inputs, **model_generate_parameters)
    # Drop the prompt tokens so only the newly generated part is decoded.
    tokens = tokens[:, inputs.input_ids.shape[1] :]

raw_outputs = tokenizer.batch_decode(tokens, skip_special_tokens=True)
outputs = [_postprocess_output_cypher(output) for output in raw_outputs]
print(outputs)
> ["MATCH (a:Actor {Name: 'Tom Hanks'})-[:ActedIn]->(m:Movie) RETURN m"]
neo4j-graphrag package::SchemaReader functions