wrapping
Browse files
app.py
CHANGED
|
@@ -17,8 +17,10 @@ def apply_power_scaling(sizes: list, exponent=0.2) -> list:
|
|
| 17 |
return [size**exponent if size is not None else 0 for size in sizes]
|
| 18 |
|
| 19 |
|
| 20 |
-
def count_chunks(sizes: list) -> list:
|
| 21 |
"""Count the number of chunks, which are 64KB each in size; always roundup"""
|
|
|
|
|
|
|
| 22 |
return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
|
| 23 |
|
| 24 |
|
|
@@ -99,7 +101,7 @@ def flatten_hierarchy(hierarchy, root_name="Repository"):
|
|
| 99 |
return labels, parents, sizes, ids
|
| 100 |
|
| 101 |
|
| 102 |
-
def visualize_repo_treemap(r_info):
|
| 103 |
"""Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
|
| 104 |
siblings = r_info.siblings
|
| 105 |
hierarchy = build_hierarchy(siblings)
|
|
@@ -108,7 +110,7 @@ def visualize_repo_treemap(r_info):
|
|
| 108 |
calculate_directory_sizes(hierarchy)
|
| 109 |
|
| 110 |
# Flatten the hierarchy for Plotly
|
| 111 |
-
labels, parents, sizes, ids = flatten_hierarchy(hierarchy)
|
| 112 |
|
| 113 |
# Scale for vix
|
| 114 |
scaled_sizes = apply_power_scaling(sizes)
|
|
@@ -138,7 +140,7 @@ def visualize_repo_treemap(r_info):
|
|
| 138 |
values=scaled_sizes,
|
| 139 |
color=normalized_colors,
|
| 140 |
color_continuous_scale=colorscale,
|
| 141 |
-
title="
|
| 142 |
custom_data=[formatted_sizes, chunks],
|
| 143 |
height=1000,
|
| 144 |
ids=ids,
|
|
@@ -149,7 +151,7 @@ def visualize_repo_treemap(r_info):
|
|
| 149 |
# Add subtitle by updating the layout
|
| 150 |
fig.update_layout(
|
| 151 |
title={
|
| 152 |
-
"text": "
|
| 153 |
"x": 0.5,
|
| 154 |
"xanchor": "center",
|
| 155 |
},
|
|
@@ -189,16 +191,18 @@ def format_repo_size(r_size: int) -> str:
|
|
| 189 |
|
| 190 |
def repo_files(r_type: str, r_id: str) -> tuple:
    """Fetch a repository's file listing and total it per file extension.

    Args:
        r_type: Repository type, forwarded to ``HF_API.repo_info`` as
            ``repo_type``.
        r_id: Repository id, forwarded as ``repo_id``.

    Returns:
        A ``(files, fig)`` tuple. ``files`` maps each extension to a dict with
        the keys ``"size"`` (summed bytes) and ``"count"`` (number of files);
        ``fig`` is whatever ``visualize_repo_treemap`` returns for the repo.
    """
    r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
    fig = visualize_repo_treemap(r_info)
    files = {}
    for sibling in r_info.siblings:
        # Text after the last "."; a name without any "." is used whole.
        ext = sibling.rfilename.split(".")[-1]
        # A sibling without size metadata counts as 0 bytes instead of
        # poisoning the running total with None.
        size = sibling.size or 0
        # Keys are inserted in the order size, count, as before.
        entry = files.setdefault(ext, {"size": 0, "count": 0})
        entry["size"] += size
        entry["count"] += 1
    return files, fig
|
| 204 |
|
|
@@ -226,7 +230,11 @@ def repo_size(r_type, r_id):
|
|
| 226 |
return {}
|
| 227 |
size = response.get("size")
|
| 228 |
if size is not None:
|
| 229 |
-
repo_sizes[branch.name] =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
return repo_sizes
|
| 231 |
|
| 232 |
|
|
@@ -246,40 +254,32 @@ def get_repo_info(r_type, r_id):
|
|
| 246 |
gr.Dataframe(visible=False),
|
| 247 |
)
|
| 248 |
|
| 249 |
-
rf_sizes_df = (
|
| 250 |
-
pd.DataFrame(repo_files_info)
|
| 251 |
-
.T.reset_index(names="ext")
|
| 252 |
-
.sort_values(by="size", ascending=False)
|
| 253 |
-
)
|
| 254 |
# check if repo_sizes is just {}
|
| 255 |
if not repo_sizes:
|
| 256 |
r_sizes_component = gr.Dataframe(visible=False)
|
| 257 |
b_block = gr.Row(visible=False)
|
| 258 |
else:
|
| 259 |
-
r_sizes_df = pd.DataFrame(repo_sizes
|
| 260 |
-
|
|
|
|
| 261 |
)
|
| 262 |
-
r_sizes_df["
|
| 263 |
-
r_sizes_df.columns = ["Branch", "bytes", "Size"]
|
| 264 |
r_sizes_component = gr.Dataframe(
|
| 265 |
-
value=r_sizes_df[["Branch", "Size"]], visible=True
|
| 266 |
)
|
| 267 |
b_block = gr.Row(visible=True)
|
| 268 |
|
| 269 |
-
rf_sizes_df
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
values="bytes",
|
| 274 |
-
names="Extension",
|
| 275 |
-
hover_data=["Size"],
|
| 276 |
-
title=f"File Distribution in {r_id}",
|
| 277 |
-
hole=0.3,
|
| 278 |
)
|
|
|
|
|
|
|
| 279 |
return (
|
| 280 |
gr.Row(visible=True),
|
| 281 |
gr.Dataframe(
|
| 282 |
-
value=rf_sizes_df[["Extension", "Count", "Size"]],
|
| 283 |
visible=True,
|
| 284 |
),
|
| 285 |
# gr.Plot(rf_sizes_plot, visible=True),
|
|
@@ -290,9 +290,9 @@ def get_repo_info(r_type, r_id):
|
|
| 290 |
|
| 291 |
|
| 292 |
with gr.Blocks(theme="ocean") as demo:
|
| 293 |
-
gr.Markdown("#
|
| 294 |
gr.Markdown(
|
| 295 |
-
"Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's
|
| 296 |
)
|
| 297 |
with gr.Blocks():
|
| 298 |
# repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
|
|
@@ -310,15 +310,23 @@ with gr.Blocks(theme="ocean") as demo:
|
|
| 310 |
with gr.Blocks():
|
| 311 |
with gr.Row(visible=False) as results_block:
|
| 312 |
with gr.Column():
|
| 313 |
-
gr.Markdown("##
|
| 314 |
file_info_plot = gr.Plot(visible=False)
|
| 315 |
-
with gr.Row():
|
| 316 |
-
file_info = gr.Dataframe(visible=False)
|
| 317 |
-
# file_info_plot = gr.Plot(visible=False)
|
| 318 |
with gr.Row(visible=False) as branch_block:
|
| 319 |
with gr.Column():
|
| 320 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
| 321 |
branch_sizes = gr.Dataframe(visible=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 322 |
|
| 323 |
search_button.click(
|
| 324 |
get_repo_info,
|
|
|
|
| 17 |
return [size**exponent if size is not None else 0 for size in sizes]
|
| 18 |
|
| 19 |
|
| 20 |
+
def count_chunks(sizes: list | int) -> list:
|
| 21 |
"""Count the number of chunks, which are 64KB each in size; always roundup"""
|
| 22 |
+
if isinstance(sizes, int):
|
| 23 |
+
return int(np.ceil(sizes / 64_000))
|
| 24 |
return [int(np.ceil(size / 64_000)) if size is not None else 0 for size in sizes]
|
| 25 |
|
| 26 |
|
|
|
|
| 101 |
return labels, parents, sizes, ids
|
| 102 |
|
| 103 |
|
| 104 |
+
def visualize_repo_treemap(r_info: dict, r_id: str) -> px.treemap:
|
| 105 |
"""Visualizes the repository as a treemap with directory sizes and human-readable tooltips."""
|
| 106 |
siblings = r_info.siblings
|
| 107 |
hierarchy = build_hierarchy(siblings)
|
|
|
|
| 110 |
calculate_directory_sizes(hierarchy)
|
| 111 |
|
| 112 |
# Flatten the hierarchy for Plotly
|
| 113 |
+
labels, parents, sizes, ids = flatten_hierarchy(hierarchy, r_id)
|
| 114 |
|
| 115 |
# Scale for vix
|
| 116 |
scaled_sizes = apply_power_scaling(sizes)
|
|
|
|
| 140 |
values=scaled_sizes,
|
| 141 |
color=normalized_colors,
|
| 142 |
color_continuous_scale=colorscale,
|
| 143 |
+
title=f"{r_id} by Chunks",
|
| 144 |
custom_data=[formatted_sizes, chunks],
|
| 145 |
height=1000,
|
| 146 |
ids=ids,
|
|
|
|
| 151 |
# Add subtitle by updating the layout
|
| 152 |
fig.update_layout(
|
| 153 |
title={
|
| 154 |
+
"text": f"{r_id} file and chunk treemap<br><span style='font-size:14px;'>Hover over each directory/file to see its size and number of chunks it contains.</span>",
|
| 155 |
"x": 0.5,
|
| 156 |
"xanchor": "center",
|
| 157 |
},
|
|
|
|
| 191 |
|
| 192 |
def repo_files(r_type: str, r_id: str) -> tuple:
    """Fetch a repository's file listing and total it per file extension.

    Args:
        r_type: Repository type, forwarded to ``HF_API.repo_info`` as
            ``repo_type``.
        r_id: Repository id, forwarded as ``repo_id`` and passed on to the
            treemap builder.

    Returns:
        A ``(files, fig)`` tuple. ``files`` maps each extension to a dict with
        the keys ``"size"`` (summed bytes), ``"chunks"`` (summed per-file
        chunk counts) and ``"count"`` (number of files); ``fig`` is whatever
        ``visualize_repo_treemap`` returns for the repo.
    """
    r_info = HF_API.repo_info(repo_id=r_id, repo_type=r_type, files_metadata=True)
    fig = visualize_repo_treemap(r_info, r_id)
    files = {}
    for sibling in r_info.siblings:
        # Text after the last "."; a name without any "." is used whole.
        ext = sibling.rfilename.split(".")[-1]
        # A sibling without size metadata counts as 0 bytes / 0 chunks instead
        # of raising inside count_chunks or poisoning the running total.
        size = sibling.size or 0
        # Key order (size, chunks, count) must stay as is: get_repo_info
        # renames the resulting DataFrame columns by position.
        entry = files.setdefault(ext, {"size": 0, "chunks": 0, "count": 0})
        entry["size"] += size
        # Chunks are rounded up per file, then summed per extension.
        entry["chunks"] += count_chunks(size)
        entry["count"] += 1
    return files, fig
|
| 208 |
|
|
|
|
| 230 |
return {}
|
| 231 |
size = response.get("size")
|
| 232 |
if size is not None:
|
| 233 |
+
repo_sizes[branch.name] = {
|
| 234 |
+
"size_in_bytes": size,
|
| 235 |
+
"size_in_chunks": count_chunks(size),
|
| 236 |
+
}
|
| 237 |
+
|
| 238 |
return repo_sizes
|
| 239 |
|
| 240 |
|
|
|
|
| 254 |
gr.Dataframe(visible=False),
|
| 255 |
)
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
# check if repo_sizes is just {}
|
| 258 |
if not repo_sizes:
|
| 259 |
r_sizes_component = gr.Dataframe(visible=False)
|
| 260 |
b_block = gr.Row(visible=False)
|
| 261 |
else:
|
| 262 |
+
r_sizes_df = pd.DataFrame(repo_sizes).T.reset_index(names="branch")
|
| 263 |
+
r_sizes_df["formatted_size"] = r_sizes_df["size_in_bytes"].apply(
|
| 264 |
+
format_repo_size
|
| 265 |
)
|
| 266 |
+
r_sizes_df.columns = ["Branch", "size_in_bytes", "Chunks", "Size"]
|
|
|
|
| 267 |
r_sizes_component = gr.Dataframe(
|
| 268 |
+
value=r_sizes_df[["Branch", "Size", "Chunks"]], visible=True
|
| 269 |
)
|
| 270 |
b_block = gr.Row(visible=True)
|
| 271 |
|
| 272 |
+
rf_sizes_df = (
|
| 273 |
+
pd.DataFrame(repo_files_info)
|
| 274 |
+
.T.reset_index(names="ext")
|
| 275 |
+
.sort_values(by="size", ascending=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
)
|
| 277 |
+
rf_sizes_df["formatted_size"] = rf_sizes_df["size"].apply(format_repo_size)
|
| 278 |
+
rf_sizes_df.columns = ["Extension", "bytes", "Chunks", "Count", "Size"]
|
| 279 |
return (
|
| 280 |
gr.Row(visible=True),
|
| 281 |
gr.Dataframe(
|
| 282 |
+
value=rf_sizes_df[["Extension", "Count", "Size", "Chunks"]],
|
| 283 |
visible=True,
|
| 284 |
),
|
| 285 |
# gr.Plot(rf_sizes_plot, visible=True),
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
with gr.Blocks(theme="ocean") as demo:
|
| 293 |
+
gr.Markdown("# Chunking Repos")
|
| 294 |
gr.Markdown(
|
| 295 |
+
"Search for a model or dataset repository using the autocomplete below, select the repository type, and get back information about the repository's contents including the [number of chunks each file might be split into with Xet backed storage](https://huggingface.co/blog/from-files-to-chunks)."
|
| 296 |
)
|
| 297 |
with gr.Blocks():
|
| 298 |
# repo_id = gr.Textbox(label="Repository ID", placeholder="123456")
|
|
|
|
| 310 |
with gr.Blocks():
|
| 311 |
with gr.Row(visible=False) as results_block:
|
| 312 |
with gr.Column():
|
| 313 |
+
gr.Markdown("## Repo Info")
|
| 314 |
file_info_plot = gr.Plot(visible=False)
|
|
|
|
|
|
|
|
|
|
| 315 |
with gr.Row(visible=False) as branch_block:
|
| 316 |
with gr.Column():
|
| 317 |
+
gr.Markdown("### Branch Sizes")
|
| 318 |
+
gr.Markdown(
|
| 319 |
+
"The size of each branch in the repository and how many chunks it might need (assuming no dedupe)."
|
| 320 |
+
)
|
| 321 |
branch_sizes = gr.Dataframe(visible=False)
|
| 322 |
+
with gr.Row():
|
| 323 |
+
with gr.Column():
|
| 324 |
+
gr.Markdown("### File Sizes")
|
| 325 |
+
gr.Markdown(
|
| 326 |
+
"The cumulative size of each filetype in the repository (in the `main` branch) and how many chunks they might need (assuming no dedupe)."
|
| 327 |
+
)
|
| 328 |
+
file_info = gr.Dataframe(visible=False)
|
| 329 |
+
# file_info_plot = gr.Plot(visible=False)
|
| 330 |
|
| 331 |
search_button.click(
|
| 332 |
get_repo_info,
|