Spaces:

jsulz
/

repo-info

Running

App Files Files Community

jsulz committed on Dec 5, 2024

Commit

9d6f412

1 Parent(s): 34ae673

wrapping

Browse files

Files changed (1) hide show

app.py +42 -34

app.py CHANGED Viewed

@@ -17,8 +17,10 @@ def apply_power_scaling(sizes: list, exponent=0.2) -> list:
     return [size**exponent if size is not None else 0 for size in sizes]
-def count_chunks(sizes: list) -> list:
     """Count the number of chunks, which are 64KB each in size; always roundup"""
     return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
@@ -99,7 +101,7 @@ def flatten_hierarchy(hierarchy, root_name="Repository"):
     return labels, parents, sizes, ids
-def visualize_repo_treemap(r_info):
     """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
     siblings = r_info.siblings
     hierarchy = build_hierarchy(siblings)
@@ -108,7 +110,7 @@ def visualize_repo_treemap(r_info):
     calculate_directory_sizes(hierarchy)
     # Flatten the hierarchy for Plotly
-    labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
     # Scale for vix
     scaled_sizes = apply_power_scaling(sizes)
@@ -138,7 +140,7 @@ def visualize_repo_treemap(r_info):
         values=scaled_sizes,
         color=normalized_colors,
         color_continuous_scale=colorscale,
-        title="Repo by Chunks",
         custom_data=[formatted_sizes, chunks],
         height=1000,
         ids=ids,
@@ -149,7 +151,7 @@ def visualize_repo_treemap(r_info):
     # Add subtitle by updating the layout
     fig.update_layout(
         title={
-            "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
             "x": 0.5,
             "xanchor": "center",
         },
@@ -189,16 +191,18 @@ def format_repo_size(r_size: int) -> str:
 def repo_files(r_type: str, r_id: str) -> dict:
     r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
-    fig = visualize_repo_treemap(r_info)
     files = {}
     for sibling in r_info.siblings:
         ext = sibling.rfilename.split(".")[-1]
         if ext in files:
             files[ext]["size"] += sibling.size
             files[ext]["count"] += 1
         else:
             files[ext] = {}
             files[ext]["size"] = sibling.size
             files[ext]["count"] = 1
     return files, fig
@@ -226,7 +230,11 @@ def repo_size(r_type, r_id):
             return {}
         size = response.get("size")
         if size is not None:
-            repo_sizes[branch.name] = size
     return repo_sizes
@@ -246,40 +254,32 @@ def get_repo_info(r_type, r_id):
             gr.Dataframe(visible=False),
         )
-    rf_sizes_df = (
-        pd.DataFrame(repo_files_info)
-        .T.reset_index(names="ext")
-        .sort_values(by="size", ascending=False)
-    )
     # check if repo_sizes is just {}
     if not repo_sizes:
         r_sizes_component = gr.Dataframe(visible=False)
         b_block = gr.Row(visible=False)
     else:
-        r_sizes_df = pd.DataFrame(repo_sizes, index=["size"]).T.reset_index(
-            names="branch"
         )
-        r_sizes_df["formatted_size"] = r_sizes_df["size"].apply(format_repo_size)
-        r_sizes_df.columns = ["Branch", "bytes", "Size"]
         r_sizes_component = gr.Dataframe(
-            value=r_sizes_df[["Branch", "Size"]], visible=True
         )
         b_block = gr.Row(visible=True)
-    rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
-    rf_sizes_df.columns = ["Extension", "bytes", "Count", "Size"]
-    rf_sizes_plot = px.pie(
-        rf_sizes_df,
-        values="bytes",
-        names="Extension",
-        hover_data=["Size"],
-        title=f"File Distribution in {r_id}",
-        hole=0.3,
     )
     return (
         gr.Row(visible=True),
         gr.Dataframe(
-            value=rf_sizes_df[["Extension", "Count", "Size"]],
             visible=True,
         ),
         # gr.Plot(rf_sizes_plot, visible=True),
@@ -290,9 +290,9 @@ def get_repo_info(r_type, r_id):
 with gr.Blocks(theme="ocean") as demo:
-    gr.Markdown("# Repository Information")
     gr.Markdown(
-        "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's files and branches."
     )
     with gr.Blocks():
         # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
@@ -310,15 +310,23 @@ with gr.Blocks(theme="ocean") as demo:
     with gr.Blocks():
         with gr.Row(visible=False) as results_block:
             with gr.Column():
-                gr.Markdown("## File Information")
                 file_info_plot = gr.Plot(visible=False)
-                with gr.Row():
-                    file_info = gr.Dataframe(visible=False)
-                    # file_info_plot = gr.Plot(visible=False)
                 with gr.Row(visible=False) as branch_block:
                     with gr.Column():
-                        gr.Markdown("## Branch Sizes")
                         branch_sizes = gr.Dataframe(visible=False)
     search_button.click(
         get_repo_info,

     return [size**exponent if size is not None else 0 for size in sizes]
+def count_chunks(sizes: list | int) -> list:
     """Count the number of chunks, which are 64KB each in size; always roundup"""
+    if isinstance(sizes, int):
+        return int(np.ceil(sizes / 64_000))
     return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
     return labels, parents, sizes, ids
+def visualize_repo_treemap(r_info: dict, r_id: str) -> px.treemap:
     """Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
     siblings = r_info.siblings
     hierarchy = build_hierarchy(siblings)
     calculate_directory_sizes(hierarchy)
     # Flatten the hierarchy for Plotly
+    labels, parents, sizes, ids = flatten_hierarchy(hierarchy, r_id)
     # Scale for vix
     scaled_sizes = apply_power_scaling(sizes)
         values=scaled_sizes,
         color=normalized_colors,
         color_continuous_scale=colorscale,
+        title=f"{r_id} by Chunks",
         custom_data=[formatted_sizes, chunks],
         height=1000,
         ids=ids,
     # Add subtitle by updating the layout
     fig.update_layout(
         title={
+            "text": f"{r_id} file and chunk treemap<br><span style='font-size:14px;'>Hover over each directory/file to see its size and number of chunks it contains.</span>",
             "x": 0.5,
             "xanchor": "center",
         },
 def repo_files(r_type: str, r_id: str) -> dict:
     r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
+    fig = visualize_repo_treemap(r_info, r_id)
     files = {}
     for sibling in r_info.siblings:
         ext = sibling.rfilename.split(".")[-1]
         if ext in files:
             files[ext]["size"] += sibling.size
+            files[ext]["chunks"] += count_chunks(sibling.size)
             files[ext]["count"] += 1
         else:
             files[ext] = {}
             files[ext]["size"] = sibling.size
+            files[ext]["chunks"] = count_chunks(sibling.size)
             files[ext]["count"] = 1
     return files, fig
             return {}
         size = response.get("size")
         if size is not None:
+            repo_sizes[branch.name] = {
+                "size_in_bytes": size,
+                "size_in_chunks": count_chunks(size),
+            }
     return repo_sizes
             gr.Dataframe(visible=False),
         )
     # check if repo_sizes is just {}
     if not repo_sizes:
         r_sizes_component = gr.Dataframe(visible=False)
         b_block = gr.Row(visible=False)
     else:
+        r_sizes_df = pd.DataFrame(repo_sizes).T.reset_index(names="branch")
+        r_sizes_df["formatted_size"] = r_sizes_df["size_in_bytes"].apply(
+            format_repo_size
         )
+        r_sizes_df.columns = ["Branch", "size_in_bytes", "Chunks", "Size"]
         r_sizes_component = gr.Dataframe(
+            value=r_sizes_df[["Branch", "Size", "Chunks"]], visible=True
         )
         b_block = gr.Row(visible=True)
+    rf_sizes_df = (
+        pd.DataFrame(repo_files_info)
+        .T.reset_index(names="ext")
+        .sort_values(by="size", ascending=False)
     )
+    rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
+    rf_sizes_df.columns = ["Extension", "bytes", "Chunks", "Count", "Size"]
     return (
         gr.Row(visible=True),
         gr.Dataframe(
+            value=rf_sizes_df[["Extension", "Count", "Size", "Chunks"]],
             visible=True,
         ),
         # gr.Plot(rf_sizes_plot, visible=True),
 with gr.Blocks(theme="ocean") as demo:
+    gr.Markdown("# Chunking Repos")
     gr.Markdown(
+        "Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's contents including the [number of chunks each file might be split into with Xet backed storage](https://huggingface.co/blog/from-files-to-chunks)."
     )
     with gr.Blocks():
         # repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
     with gr.Blocks():
         with gr.Row(visible=False) as results_block:
             with gr.Column():
+                gr.Markdown("## Repo Info")
                 file_info_plot = gr.Plot(visible=False)
                 with gr.Row(visible=False) as branch_block:
                     with gr.Column():
+                        gr.Markdown("### Branch Sizes")
+                        gr.Markdown(
+                            "The size of each branch in the repository and how many chunks it might need (assuming no dedupe)."
+                        )
                         branch_sizes = gr.Dataframe(visible=False)
+                with gr.Row():
+                    with gr.Column():
+                        gr.Markdown("### File Sizes")
+                        gr.Markdown(
+                            "The cumulative size of each filetype in the repository (in the `main` branch) and how many chunks they might need (assuming no dedupe)."
+                        )
+                        file_info = gr.Dataframe(visible=False)
+                    # file_info_plot = gr.Plot(visible=False)
     search_button.click(
         get_repo_info,