Spaces:

jsulz
/

repo-info

Running

App Files Files Community

jsulz committed on Dec 5, 2024

Commit

34ae673

1 Parent(s): adbb8fc

needs cleanup, but most of the treemap is working correctly

Browse files

Files changed (1) hide show

app.py +65 -33

app.py CHANGED Viewed

@@ -12,19 +12,17 @@ import numpy as np
 HF_API = HfApi()
-def apply_power_scaling(sizes, exponent=0.2):
     """Apply custom power scaling to the sizes."""
-    """skip over if size is none, but make sure to fill it as 0"""
     return [size**exponent if size is not None else 0 for size in sizes]
-def count_chunks(sizes):
-    """Count the number of chunks, which are 64KB each in size - which are bytes"""
-    """always round up to the nearest chunk"""
     return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
-def build_hierarchy(siblings):
     """Builds a hierarchical structure from the list of RepoSibling objects."""
     hierarchy = defaultdict(dict)
@@ -33,9 +31,9 @@ def build_hierarchy(siblings):
         size = sibling.lfs.size if sibling.lfs else sibling.size
         current_level = hierarchy
-        for part in path_parts[:-1]:  # Traverse directories
             current_level = current_level.setdefault(part, {})
-        current_level[path_parts[-1]] = size  # Assign size to the file
     return hierarchy
@@ -45,49 +43,60 @@ def calculate_directory_sizes(hierarchy):
     total_size = 0
     for key, value in hierarchy.items():
-        if isinstance(value, dict):  # Directory
-            dir_size = calculate_directory_sizes(value)  # Recursively calculate size
             hierarchy[key] = {
                 "__size__": dir_size,
                 **value,
-            }  # Add size to directory metadata
             total_size += dir_size
-        else:  # File
             total_size += value
     return total_size
-def flatten_hierarchy_with_directory_sizes(hierarchy, root_name="Repository"):
     """Flatten a nested dictionary into Plotly-compatible treemap data with a defined root node."""
     labels = []
     parents = []
     sizes = []
     # Recursively process the hierarchy
     def process_level(current_hierarchy, current_parent):
         for key, value in current_hierarchy.items():
-            if isinstance(value, dict) and "__size__" in value:  # Directory
-                dir_size = value.pop("__size__")  # Extract directory size
                 labels.append(key)
                 parents.append(current_parent)
                 sizes.append(dir_size)
-                process_level(value, key)  # Recurse into subdirectories
-            else:  # File
                 labels.append(key)
                 parents.append(current_parent)
                 sizes.append(value)
     # Add the root node
     total_size = calculate_directory_sizes(hierarchy)
     labels.append(root_name)
-    parents.append("")  # Root has no parent
     sizes.append(total_size)
     # Process the hierarchy
     process_level(hierarchy, root_name)
-    return labels, parents, sizes
 def visualize_repo_treemap(r_info):
@@ -98,55 +107,78 @@ def visualize_repo_treemap(r_info):
     # Calculate directory sizes
     calculate_directory_sizes(hierarchy)
-    # Flatten the hierarchy into Plotly-compatible format
-    labels, parents, sizes = flatten_hierarchy_with_directory_sizes(hierarchy)
-    # Apply the chosen scaling function for visualization
     scaled_sizes = apply_power_scaling(sizes)
     # Format the original sizes using the helper function
     formatted_sizes = [
-        (
-            format_repo_size(size) if size is not None else None
-        )  # Format both files and directories
-        for size in sizes
     ]
     chunks = count_chunks(sizes)
     # Create the treemap
     fig = px.treemap(
         names=labels,
         parents=parents,
         values=scaled_sizes,
         title="Repo by Chunks",
         custom_data=[formatted_sizes, chunks],
     )
     # Add subtitle by updating the layout
     fig.update_layout(
         title={
             "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
-            "x": 0.5,  # Center the title and subtitle
             "xanchor": "center",
-        }
     )
-    # Customize the hover template to include directory sizes
     fig.update_traces(
         hovertemplate=(
-            "<b>%{label}</b><br>"  # File/Directory name
-            "Size: %{customdata[0]}<br>"  # Scaled size shown in treemap
-            "# of Chunks: %{customdata[1]}"  # Formatted size from custom data
         )
     )
     fig.update_traces(root_color="lightgrey")
-    fig.update_layout(margin=dict(t=50, l=25, r=25, b=25))
     return fig
 def format_repo_size(r_size: int) -> str:
     units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
     order = 0
     while r_size >= 1024 and order < len(units) - 1:

 HF_API = HfApi()
+def apply_power_scaling(sizes: list, exponent=0.2) -> list:
     """Apply custom power scaling to the sizes."""
     return [size**exponent if size is not None else 0 for size in sizes]
+def count_chunks(sizes: list) -> list:
+    """Count the number of chunks, which are 64KB each in size; always round up"""
     return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
+def build_hierarchy(siblings: list) -> dict:
     """Builds a hierarchical structure from the list of RepoSibling objects."""
     hierarchy = defaultdict(dict)
         size = sibling.lfs.size if sibling.lfs else sibling.size
         current_level = hierarchy
+        for part in path_parts[:-1]:
             current_level = current_level.setdefault(part, {})
+        current_level[path_parts[-1]] = size
     return hierarchy
     total_size = 0
     for key, value in hierarchy.items():
+        if isinstance(value, dict):
+            dir_size = calculate_directory_sizes(value)
             hierarchy[key] = {
                 "__size__": dir_size,
                 **value,
+            }
             total_size += dir_size
+        else:
             total_size += value
     return total_size
+def build_full_path(current_parent, key):
+    return f"{current_parent}/{key}" if current_parent else key
+def flatten_hierarchy(hierarchy, root_name="Repository"):
     """Flatten a nested dictionary into Plotly-compatible treemap data with a defined root node."""
     labels = []
     parents = []
     sizes = []
+    ids = []
     # Recursively process the hierarchy
     def process_level(current_hierarchy, current_parent):
         for key, value in current_hierarchy.items():
+            full_path = build_full_path(current_parent, key)
+            if isinstance(value, dict) and "__size__" in value:
+                # Handle directories
+                dir_size = value.pop("__size__")
                 labels.append(key)
                 parents.append(current_parent)
                 sizes.append(dir_size)
+                ids.append(full_path)
+                process_level(value, full_path)
+            else:
+                # Handle files
                 labels.append(key)
                 parents.append(current_parent)
                 sizes.append(value)
+                ids.append(full_path)
     # Add the root node
     total_size = calculate_directory_sizes(hierarchy)
     labels.append(root_name)
+    parents.append("")
     sizes.append(total_size)
+    ids.append(root_name)
     # Process the hierarchy
     process_level(hierarchy, root_name)
+    return labels, parents, sizes, ids
 def visualize_repo_treemap(r_info):
     # Calculate directory sizes
     calculate_directory_sizes(hierarchy)
+    # Flatten the hierarchy for Plotly
+    labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
+    # Scale for visualization
     scaled_sizes = apply_power_scaling(sizes)
     # Format the original sizes using the helper function
     formatted_sizes = [
+        (format_repo_size(size) if size is not None else None) for size in sizes
     ]
     chunks = count_chunks(sizes)
+    colors = scaled_sizes[:]
+    colors[0] = -1
+    max_value = max(scaled_sizes)
+    normalized_colors = [value / max_value if value > 0 else 0 for value in colors]
+    # Define the colorscale; mimics the plasma scale
+    colorscale = [
+        [0.0, "#0d0887"],
+        [0.5, "#bd3786"],
+        [1.0, "#f0f921"],
+    ]
     # Create the treemap
     fig = px.treemap(
         names=labels,
         parents=parents,
         values=scaled_sizes,
+        color=normalized_colors,
+        color_continuous_scale=colorscale,
         title="Repo by Chunks",
         custom_data=[formatted_sizes, chunks],
+        height=1000,
+        ids=ids,
     )
+    fig.update_traces(marker={"colors": ["lightgrey"] + normalized_colors[1:]})
     # Add subtitle by updating the layout
     fig.update_layout(
         title={
             "text": "Repo File Size Treemap<br><span style='font-size:14px;'>Hover over each directory or file to see the size of the file and its number of chunks</span>",
+            "x": 0.5,
             "xanchor": "center",
+        },
+        coloraxis_showscale=False,
     )
+    # Customize the hover template
     fig.update_traces(
         hovertemplate=(
+            "<b>%{label}</b><br>"
+            "Size: %{customdata[0]}<br>"
+            "# of Chunks: %{customdata[1]}"
         )
     )
     fig.update_traces(root_color="lightgrey")
     return fig
 def format_repo_size(r_size: int) -> str:
+    """
+    Convert a repository size in bytes to a human-readable string with appropriate units.
+    Args:
+        r_size (int): The size of the repository in bytes.
+    Returns:
+        str: The formatted size string with appropriate units (B, KB, MB, GB, TB, PB).
+    """
     units = {0: "B", 1: "KB", 2: "MB", 3: "GB", 4: "TB", 5: "PB"}
     order = 0
     while r_size >= 1024 and order < len(units) - 1: