import io, math, json, gzip
import numpy as np
import pandas as pd
import gradio as gr

# -------------------------------
# Core metric helpers
# -------------------------------
def shannon_entropy_from_counts(counts: np.ndarray) -> float:
    counts = counts.astype(float)
    total = counts.sum()
    if total <= 0:
        return 0.0
    p = counts / total
    p = p[p > 0]
    return float(-(p * np.log2(p)).sum())

def numeric_binned_entropy(series: pd.Series, bins: int = 32):
    x = series.dropna().astype(float).values
    if x.size == 0:
        return 0.0, 0
    try:
        qs = np.linspace(0, 1, bins + 1)
        edges = np.unique(np.nanpercentile(x, qs * 100))
        if len(edges) < 2:
            # Constant column: a single distinct value means one bin and zero entropy.
            return 0.0, 1
        hist, _ = np.histogram(x, bins=edges)
    except Exception:
        hist, _ = np.histogram(x, bins=bins)
    H = shannon_entropy_from_counts(hist)
    k = np.count_nonzero(hist)
    return H, max(k, 1)

def categorical_entropy(series: pd.Series):
    x = series.dropna().astype(str).values
    if x.size == 0:
        return 0.0, 0
    vals, counts = np.unique(x, return_counts=True)
    H = shannon_entropy_from_counts(counts)
    return H, len(vals)

def monotone_runs_and_entropy(series: pd.Series):
    x = series.dropna().values
    n = len(x)
    if n <= 1:
        return 1, 0.0
    runs = [1]
    for i in range(1, n):
        if x[i] >= x[i-1]:
            runs[-1] += 1
        else:
            runs.append(1)
    run_lengths = np.array(runs, dtype=float)
    H = shannon_entropy_from_counts(run_lengths)
    return len(runs), H

def sortedness_score(series: pd.Series) -> float:
    x = series.dropna().values
    if len(x) <= 1:
        return 1.0
    return float(np.mean(np.diff(x) >= 0))

def gzip_compress_ratio_from_bytes(b: bytes) -> float:
    if len(b) == 0:
        return 1.0
    out = io.BytesIO()
    with gzip.GzipFile(fileobj=out, mode="wb") as f:
        f.write(b)
    compressed = out.getvalue()
    return len(compressed) / len(b)

def dataframe_gzip_ratio(df: pd.DataFrame, max_rows: int = 20000) -> float:
    s = df.sample(min(len(df), max_rows), random_state=0) if len(df) > max_rows else df
    raw = s.to_csv(index=False).encode("utf-8", errors="ignore")
    return gzip_compress_ratio_from_bytes(raw)

def pareto_maxima_count(points: np.ndarray) -> int:
    if points.shape[1] < 2 or points.shape[0] == 0:
        return 0
    P = points[:, :2]
    order = np.lexsort((-P[:, 1], -P[:, 0]))
    best_y = -np.inf
    count = 0
    for idx in order:
        y = P[idx, 1]
        if y >= best_y:
            count += 1
            best_y = y
    return int(count)

def kd_entropy(points: np.ndarray, max_leaf: int = 128, axis: int = 0) -> float:
    """Entropy (bits) accumulated over recursive kd-style median splits.

    Skewed or quickly terminating splits (blocky, duplicate-heavy data)
    give a low total; evenly spread points give a high one.
    """
    n = points.shape[0]
    if n <= max_leaf:  # also covers the empty case
        return 0.0
    if np.all(points == points[0]):
        # All remaining points are identical: median splits make no progress,
        # so stop here instead of recursing forever.
        return 0.0
    vals = points[:, axis]
    med = np.median(vals)
    left = points[vals <= med]
    right = points[vals > med]
    pL = len(left) / n
    pR = len(right) / n
    H_here = 0.0
    for p in (pL, pR):
        if p > 0:
            H_here += -p * math.log(p, 2)
    next_axis = (axis + 1) % points.shape[1]
    return H_here + kd_entropy(left, max_leaf, next_axis) + kd_entropy(right, max_leaf, next_axis)

def normalize(value: float, max_value: float) -> float:
    if max_value <= 0:
        return 0.0
    v = max(0.0, min(1.0, value / max_value))
    return float(v)

# -------------------------------
# Scoring + interpretations
# -------------------------------
def grade_band(value: float, thresholds: list, labels: list):
    """Generic banding helper: thresholds ascending; returns (label_idx, label)."""
    for i, t in enumerate(thresholds):
        if value <= t:
            return i, labels[i]
    return len(labels)-1, labels[-1]

def interpret_report(report: dict) -> dict:
    """Produce human-friendly interpretations with color badges and advice."""
    r, c = report["shape"]["rows"], report["shape"]["cols"]
    max_bits = math.log2(max(2, r))

    # Harvestable Energy (0..1)
    he = report.get("harvestable_energy_score", 0.0)
    he_pct = round(100 * he)
    he_idx, he_label = grade_band(1.0 - he, [0.15, 0.35, 0.6, 0.85],  # invert so higher is better
                                  ["Excellent", "High", "Moderate", "Low", "Very Low"])
    he_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][he_idx]

    # Gzip ratio (lower is better)
    gz = report.get("gzip_compression_ratio", 1.0)
    gz_idx, gz_label = grade_band(gz, [0.45, 0.7, 0.9, 1.1], ["Highly compressible", "Compressible", "Some structure", "Low structure", "Unstructured"])
    gz_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][gz_idx]

    # kd-entropy (lower is better). Normalize by log2(n)
    Hkd = float(report.get("kd_partition_entropy_bits", 0.0))
    Hkd_norm = normalize(Hkd, max_bits)
    kd_idx, kd_label = grade_band(Hkd_norm, [0.15, 0.3, 0.5, 0.75], ["Simple spatial blocks", "Moderately simple", "Mixed", "Complex", "Highly complex"])
    kd_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][kd_idx]

    # Run-entropy / Sortedness aggregation for numeric columns
    per_col = report.get("per_column", {})
    run_H = []
    sorted_fracs = []
    for col, st in per_col.items():
        if "run_entropy_bits" in st:
            run_H.append(st["run_entropy_bits"])
            sorted_fracs.append(st.get("sortedness_fraction", 0.0))
    if run_H:
        runH_mean = float(np.mean(run_H))
        runH_norm = normalize(runH_mean, max_bits)
        sort_mean = float(np.mean(sorted_fracs)) if sorted_fracs else 0.0
    else:
        runH_norm = 1.0
        sort_mean = 0.0

    run_idx, run_label = grade_band(runH_norm, [0.15, 0.3, 0.5, 0.75], ["Long smooth runs", "Mostly smooth", "Mixed runs", "Choppy", "Highly choppy"])
    run_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][run_idx]

    sort_idx, sort_label = grade_band(1.0 - sort_mean, [0.15, 0.3, 0.5, 0.75], ["Highly sorted", "Mostly sorted", "Partially sorted", "Barely sorted", "Unsorted"])
    sort_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][sort_idx]

    # Duplicate rows
    dup = report.get("duplicate_row_fraction", 0.0)
    dup_idx, dup_label = grade_band(dup, [0.01, 0.05, 0.15, 0.3], ["Clean", "Light dups", "Moderate dups", "High dups", "Very high dups"])
    dup_color = ["#10b981", "#34d399", "#f59e0b", "#f97316", "#ef4444"][dup_idx]

    # Recommendations (simple rule-based)
    recs = []
    if he >= 0.7:
        recs.append("Leverage **adaptive algorithms** (TimSort-style merges, linear hull/skyline passes) for near-linear performance.")
    elif he >= 0.4:
        recs.append("Consider **light preprocessing** (bucketing, dedupe) to unlock more adaptive speedups.")
    else:
        recs.append("Expect **near worst-case costs**; use robust algorithms and consider feature engineering/cleaning.")

    if gz <= 0.7:
        recs.append("Data is **highly compressible** β†’ try dictionary/columnar encoding and caching to cut memory/IO.")
    elif gz >= 1.0:
        recs.append("Data is **hard to compress** β†’ prioritize dimensionality reduction or noise filtering.")

    if runH_norm <= 0.3 or sort_mean >= 0.7:
        recs.append("Columns show **long monotone runs** β†’ merges and single-pass scans will be efficient.")
    else:
        recs.append("Columns are **choppy** β†’ batch/aggregate before sorting to reduce comparisons.")

    if Hkd_norm <= 0.3:
        recs.append("Spatial structure is **simple** β†’ kd/quad trees will be shallow; range queries will be fast.")
    elif Hkd_norm >= 0.6:
        recs.append("Spatial structure is **complex** β†’ consider clustering/tiling before building indexes.")

    if dup >= 0.05:
        recs.append("De-duplicate rows to lower entropy and improve compression & joins.")

    # Summary verdict
    verdict = ["Outstanding structure for fast algorithms.",
               "Strong latent order; plenty of speed to harvest.",
               "Mixed: some order present; moderate gains possible.",
               "Low order; focus on cleaning and feature engineering.",
               "Chaotic: assume worst-case runtimes."][he_idx]

    return {
        "he": {"pct": he_pct, "label": he_label, "color": he_color},
        "gzip": {"value": gz, "label": gz_label, "color": gz_color},
        "kd": {"value": Hkd, "label": kd_label, "color": kd_color},
        "runs": {"value": runH_norm, "label": run_label, "color": run_color},
        "sorted": {"value": sort_mean, "label": sort_label, "color": sort_color},
        "dup": {"value": dup, "label": dup_label, "color": dup_color},
        "verdict": verdict,
        "recs": recs[:6]
    }

# -------------------------------
# Compute metrics
# -------------------------------
def compute_metrics(df: pd.DataFrame) -> dict:
    report = {}
    n_rows, n_cols = df.shape
    report["shape"] = {"rows": int(n_rows), "cols": int(n_cols)}

    # Types
    types = {}
    for c in df.columns:
        s = df[c]
        if pd.api.types.is_numeric_dtype(s):
            types[c] = "numeric"
        elif pd.api.types.is_datetime64_any_dtype(s) or "date" in str(s.dtype).lower():
            types[c] = "datetime"
        else:
            types[c] = "categorical"
    report["column_types"] = types

    missing = df.isna().mean().to_dict()
    dup_ratio = float((len(df) - len(df.drop_duplicates())) / max(1, len(df)))
    report["missing_fraction_per_column"] = {k: float(v) for k, v in missing.items()}
    report["duplicate_row_fraction"] = dup_ratio

    col_stats = {}
    for c in df.columns:
        s = df[c]
        if types[c] == "numeric":
            H, k = numeric_binned_entropy(s)
            runs, Hruns = monotone_runs_and_entropy(s)
            sorted_frac = sortedness_score(s)
            col_stats[c] = {
                "entropy_binned_bits": float(H),
                "active_bins": int(k),
                "monotone_runs": int(runs),
                "run_entropy_bits": float(Hruns),
                "sortedness_fraction": float(sorted_frac),
                "min": float(np.nanmin(s.values)) if s.dropna().shape[0] else None,
                "max": float(np.nanmax(s.values)) if s.dropna().shape[0] else None,
                "mean": float(np.nanmean(s.values)) if s.dropna().shape[0] else None,
                "std": float(np.nanstd(s.values)) if s.dropna().shape[0] else None,
            }
        elif types[c] == "datetime":
            try:
                sd = pd.to_datetime(s, errors="coerce")
                min_dt = sd.min()
                max_dt = sd.max()
                col_stats[c] = {
                    "entropy_bits": 0.0,
                    "unique_values": int(sd.nunique(dropna=True)),
                    "min_datetime": None if pd.isna(min_dt) else min_dt.isoformat(),
                    "max_datetime": None if pd.isna(max_dt) else max_dt.isoformat(),
                }
            except Exception:
                col_stats[c] = {"entropy_bits": 0.0, "unique_values": int(s.nunique(dropna=True))}
        else:
            H, k = categorical_entropy(s)
            # top-5 categories
            vc = s.astype(str).value_counts(dropna=True).head(5)
            top5 = [{"value": str(idx), "count": int(cnt)} for idx, cnt in vc.items()]
            col_stats[c] = {"entropy_bits": float(H), "unique_values": int(k), "top_values": top5}
    report["per_column"] = col_stats

    try:
        gzip_ratio = dataframe_gzip_ratio(df)
    except Exception:
        gzip_ratio = 1.0
    report["gzip_compression_ratio"] = float(gzip_ratio)

    num_cols = [c for c, t in types.items() if t == "numeric"]
    if len(num_cols) >= 2:
        X = df[num_cols].select_dtypes(include=[np.number]).values.astype(float)
        X = X[~np.isnan(X).any(axis=1)]
        if X.shape[0] >= 3:
            pts2 = X[:, :2]
            report["pareto_maxima_2d"] = int(pareto_maxima_count(pts2))
            try:
                H_kd = kd_entropy(pts2, max_leaf=128, axis=0)
            except Exception:
                H_kd = 0.0
            report["kd_partition_entropy_bits"] = float(H_kd)
        else:
            report["pareto_maxima_2d"] = 0
            report["kd_partition_entropy_bits"] = 0.0
    else:
        report["pareto_maxima_2d"] = 0
        report["kd_partition_entropy_bits"] = 0.0

    # Harvestable Energy
    max_bits = math.log2(max(2, n_rows))
    he_parts = []
    he_parts.append(1.0 - max(0.0, min(1.0, report["gzip_compression_ratio"])))
    num_run_entropies = []
    for c in df.columns:
        st = col_stats.get(c, {})
        if "run_entropy_bits" in st:
            num_run_entropies.append(st["run_entropy_bits"])
    if num_run_entropies:
        mean_run_H = float(np.mean(num_run_entropies))
        he_parts.append(1.0 - normalize(mean_run_H, max_bits))
    H_kd = report.get("kd_partition_entropy_bits", 0.0)
    if H_kd is not None:
        he_parts.append(1.0 - normalize(float(H_kd), max_bits))
    if he_parts:
        HE = float(np.mean([max(0.0, min(1.0, v)) for v in he_parts]))
    else:
        HE = 0.0
    report["harvestable_energy_score"] = HE

    return report

# -------------------------------
# Dataset shape summary for other models
# -------------------------------
def dataset_shape_summary(df: pd.DataFrame, report: dict, max_examples: int = 3) -> dict:
    """Compact JSON describing the dataset schema, ranges, and examples for LLM ingestion."""
    cols = []
    for name, t in report["column_types"].items():
        col_info = {"name": name, "type": t}
        per = report["per_column"].get(name, {})
        if t == "numeric":
            col_info.update({
                "min": per.get("min"),
                "max": per.get("max"),
                "mean": per.get("mean"),
                "std": per.get("std"),
                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
            })
        elif t == "datetime":
            col_info.update({
                "min": per.get("min_datetime"),
                "max": per.get("max_datetime"),
                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
            })
        else:  # categorical or other
            col_info.update({
                "unique_values": per.get("unique_values"),
                "top_values": per.get("top_values", []),
                "missing_frac": report["missing_fraction_per_column"].get(name, 0.0)
            })
        cols.append(col_info)

    # few example rows (stringified to be safe)
    examples = df.head(max_examples).astype(str).to_dict(orient="records")

    shape = {
        "n_rows": report["shape"]["rows"],
        "n_cols": report["shape"]["cols"],
        "columns": cols,
        "duplicates_fraction": report.get("duplicate_row_fraction", 0.0),
        "gzip_compression_ratio": report.get("gzip_compression_ratio", None),
        "harvestable_energy_score": report.get("harvestable_energy_score", None),
        "examples": examples
    }
    return shape

# -------------------------------
# UI rendering helpers
# -------------------------------
def badge(text: str, color: str) -> str:
    return f"<span style='background:{color};color:white;padding:6px 10px;border-radius:999px;font-weight:600'>{text}</span>"

def metric_card(title: str, value: str, badge_html: str) -> str:
    return f"""
    <div style="flex:1;min-width:220px;border:1px solid #e5e7eb;border-radius:14px;padding:14px 16px;">
        <div style="font-size:14px;color:#6b7280;margin-bottom:8px">{title}</div>
        <div style="font-size:22px;font-weight:700;margin-bottom:10px">{value}</div>
        {badge_html}
    </div>
    """

def render_dashboard(report: dict, interp: dict) -> str:
    he = interp["he"]
    gz = interp["gzip"]
    kd = interp["kd"]
    runs = interp["runs"]
    sortb = interp["sorted"]
    dup = interp["dup"]

    cards = []
    cards.append(metric_card("Harvestable Energy", f"{he['pct']} / 100", badge(he['label'], he['color'])))
    cards.append(metric_card("Compressibility (gzip)", f"{gz['value']:.3f}", badge(gz['label'], gz['color'])))
    cards.append(metric_card("Range-Partition Entropy (kd bits)", f"{kd['value']:.3f}", badge(kd['label'], kd['color'])))
    cards.append(metric_card("Run-Entropy (avg, normalized)", f"{runs['value']:.2f}", badge(runs['label'], runs['color'])))
    cards.append(metric_card("Sortedness (avg fraction)", f"{sortb['value']:.2f}", badge(sortb['label'], sortb['color'])))
    cards.append(metric_card("Duplicate Rows (fraction)", f"{dup['value']:.2f}", badge(dup['label'], dup['color'])))

    grid = "<div style='display:flex;flex-wrap:wrap;gap:12px'>" + "".join(cards) + "</div>"
    verdict = f"<div style='margin-top:12px;padding:14px 16px;background:#f9fafb;border:1px solid #e5e7eb;border-radius:14px'><b>Verdict:</b> {interp['verdict']}</div>"
    return grid + verdict

def render_recs(interp: dict) -> str:
    lis = "".join([f"<li>{r}</li>" for r in interp["recs"]])
    return f"<ul>{lis}</ul>"

def render_columns(report: dict) -> str:
    rows = []
    for c, st in report.get("per_column", {}).items():
        miss = report["missing_fraction_per_column"].get(c, 0.0)
        if "entropy_binned_bits" in st:
            rows.append(f"<tr><td><b>{c}</b> (num)</td><td>{miss:.1%}</td><td>{st['entropy_binned_bits']:.2f}</td><td>{st['monotone_runs']}</td><td>{st['run_entropy_bits']:.2f}</td><td>{st['sortedness_fraction']:.2f}</td></tr>")
        elif "entropy_bits" in st:
            rows.append(f"<tr><td><b>{c}</b> (cat)</td><td>{miss:.1%}</td><td>{st['entropy_bits']:.2f}</td><td>-</td><td>-</td><td>-</td></tr>")
        else:
            rows.append(f"<tr><td><b>{c}</b></td><td>{miss:.1%}</td><td>-</td><td>-</td><td>-</td><td>-</td></tr>")
    header = "<tr><th>Column</th><th>Missing</th><th>Entropy</th><th>Monotone Runs</th><th>Run-Entropy</th><th>Sortedness</th></tr>"
    table = "<table style='width:100%;border-collapse:collapse'>" + header + "".join(rows) + "</table>"
    table = table.replace("<tr>", "<tr style='border-bottom:1px solid #e5e7eb'>")
    table = table.replace("<th>", "<th style='text-align:left;padding:8px 6px;color:#374151'>")
    table = table.replace("<td>", "<td style='padding:8px 6px;color:#111827'>")
    return table

# -------------------------------
# Gradio app
# -------------------------------
def analyze(file):
    if file is None:
        return "{}", "Please upload a CSV.", "", "", "{}"
    try:
        # Gradio may pass a filepath string or a tempfile-like object with .name.
        df = pd.read_csv(file.name if hasattr(file, "name") else file)
    except Exception as e:
        return "{}", f"Failed to read CSV: {e}", "", "", "{}"

    report = compute_metrics(df)
    interp = interpret_report(report)
    shape = dataset_shape_summary(df, report, max_examples=3)

    report_json = json.dumps(report, indent=2)
    dashboard_html = render_dashboard(report, interp)
    recs_html = render_recs(interp)
    cols_html = render_columns(report)
    shape_json = json.dumps(shape, indent=2)

    return report_json, dashboard_html, recs_html, cols_html, shape_json

with gr.Blocks(title="OrderLens — Data Interpreter") as demo:
    gr.Markdown("# OrderLens — Data Interpreter")
    gr.Markdown("Upload a CSV and get **readable** structure metrics with plain-language guidance.")
    with gr.Row():
        inp = gr.File(file_types=[".csv"], label="CSV file")
    btn = gr.Button("Analyze", variant="primary")
    gr.Markdown("---")
    gr.Markdown("### Dashboard")  # color-coded cards + verdict
    dash = gr.HTML()
    gr.Markdown("### Recommendations")  # actionable tips
    recs = gr.HTML()
    gr.Markdown("### Column Details")  # per-column table
    cols = gr.HTML()
    gr.Markdown("### Dataset Shape Summary (JSON)")  # compact schema for other models
    shape_out = gr.Code(label="Shape", language="json")
    gr.Markdown("### Raw report (JSON)")  # API-friendly
    json_out = gr.Code(label="Report", language="json")

    btn.click(analyze, inputs=inp, outputs=[json_out, dash, recs, cols, shape_out])

if __name__ == "__main__":
    demo.launch()