# Edit Distance Distribution

## Our dataset

In [3]:
from generation_steps.metrics_analysis import edit_distance_fn
from datasets import load_dataset


df = load_dataset("JetBrains-Research/synthetic-commit-msg-edits", "all_pairs", split="train").to_pandas()


def _pair_distances(pairs, g_type=None, e_type=None):
    """Return edit distances between G_text and E_text for related pairs.

    Args:
        pairs: DataFrame with `is_related`, `G_type`, `E_type`, `G_text`
            and `E_text` columns.
        g_type: if given, keep only rows whose `G_type` equals this value.
        e_type: if given, keep only rows whose `E_type` equals this value.

    Returns:
        A list with one `edit_distance_fn(pred=G_text, ref=E_text)` result
        per selected row, in row order.
    """
    mask = pairs["is_related"]
    if g_type is not None:
        mask = mask & (pairs["G_type"] == g_type)
    if e_type is not None:
        mask = mask & (pairs["E_type"] == e_type)
    selected = pairs.loc[mask]
    # Zipping the two text columns avoids building a Series per row,
    # which is what iterrows() does.
    return [edit_distance_fn(pred=generated, ref=edited)
            for generated, edited in zip(selected["G_text"], selected["E_text"])]


# Key order is significant: the chart cell iterates this dict to build its
# traces, so it fixes the legend order.
df_edit_distance = {
    "Full": _pair_distances(df),
    "Synthetic Backward": _pair_distances(df, g_type="synthetic_backward", e_type="expert_labeled"),
    # Forward edits come from two sources: initial messages and backward-synthesised ones.
    "Synthetic Forward": (
        _pair_distances(df, g_type="initial", e_type="synthetic_forward")
        + _pair_distances(df, g_type="synthetic_backward", e_type="synthetic_forward_from_backward")
    ),
    "Expert-labeled": _pair_distances(df, g_type="initial", e_type="expert_labeled"),
}

# Line colour for each subset's KDE curve in the chart.
colors = {"Expert-labeled": "#C19C0B",
          "Synthetic Backward": "#913632",
          "Synthetic Forward": "#58136a",
          "Full": "#000000"}

## FUS data

In [5]:
import pandas as pd


# Scale factor applied to every FUS bin centre and width.
# NOTE(review): presumably rescales production message lengths to match the
# dataset's — the origin of this value is not shown here; confirm.
PROD_LENGTH_RATIO = 1.772691712591923


def _scaled_bin(label):
    """Turn a "Metric Value" label into a scaled (center, width) pair.

    A label of the form "lower-upper" yields the midpoint and span of that
    range; any other label is read as a single number with width 1. Both
    results are multiplied by PROD_LENGTH_RATIO.
    """
    parts = label.split('-')
    if len(parts) == 2:
        low, high = (float(part) for part in parts)
        midpoint = (low + high) / 2
        span = high - low
    else:
        midpoint = float(label)
        span = 1
    return midpoint * PROD_LENGTH_RATIO, span * PROD_LENGTH_RATIO


df = pd.read_csv("data/fus_pycharm_fixed.csv")

scaled_bins = [_scaled_bin(label) for label in df["Metric Value"]]
bin_centers = [scaled_center for scaled_center, _ in scaled_bins]
bin_widths = [scaled_width for _, scaled_width in scaled_bins]
df["center"] = bin_centers
df["width"] = bin_widths

## Chart

In [7]:
import numpy as np
from scipy.stats import gaussian_kde
import plotly.graph_objects as go

# Upper end of the plotted edit-distance range; shared by the KDE grid and
# the x-axis so the two cannot drift apart.
MAX_EDIT_DISTANCE = 1200

# Exclude the first FUS bin from the histogram.
# NOTE(review): the reason for dropping it is not shown here — confirm.
# A separate name is used instead of rebinding `df`, so re-running this cell
# does not drop one more bin each time.
fus_bins = df.iloc[1:]

# Normalise event counts to a density: share of events divided by bin width.
bar_density = fus_bins["Events"] / (fus_bins["Events"].sum() * fus_bins["width"])

traces = [go.Bar(
        x=fus_bins["center"],
        y=bar_density,
        width=fus_bins["width"],
        name='PyCharm logs (scaled)',
        opacity=0.75,
        marker=dict(color='#1bd88a'))]

# The evaluation grid is the same for every subset, so build it once.
kde_x = np.linspace(0, MAX_EDIT_DISTANCE, 1000)

# One Gaussian-KDE curve per dataset subset, in the dict's insertion order.
for key, distances in df_edit_distance.items():
    kde = gaussian_kde(distances)
    kde_line = go.Scatter(
        x=kde_x,
        y=kde(kde_x),
        mode='lines',
        name=key,
        line=dict(color=colors[key], width=5)
    )
    traces.append(kde_line)

fig = go.Figure(data=traces)

fig.update_layout(
    bargap=0.1,
    xaxis=dict(
        title=dict(text="Edit Distance", font=dict(size=30)),
        range=[0, MAX_EDIT_DISTANCE],
        showgrid=True,
        gridcolor='lightgrey'
    ),
    yaxis=dict(
        title=dict(text="Probability Density", font=dict(size=30)),
        range=[0, 0.004],
        showgrid=True,
        gridcolor='lightgrey',
        tickvals=[0.0005, 0.001, 0.0015, 0.002, 0.0025, 0.003, 0.0035, 0.004],
        tickformat=".4f"
    ),
    # Fully transparent plot and paper backgrounds.
    plot_bgcolor='rgba(0,0,0,0)',
    paper_bgcolor='rgba(0,0,0,0)',
    font=dict(size=24),
    legend=dict(font=dict(size=30)),
    width=1600,
    height=600,
)

fig.show()

In [4]:
# Export the chart built above to "ed_distribution.pdf" (path is relative).
# NOTE(review): static export presumably needs an image-export backend
# installed alongside plotly — confirm for the target environment.
fig.write_image("ed_distribution.pdf")