# Streamlit application for browsing Hugging Face datasets (dataset viewer).
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import datasets | |
| from dataclasses import asdict | |
| from PIL import Image | |
| import yaml | |
| import textwrap | |
| import tornado | |
| import json | |
| import time | |
| import sys | |
| from git import Repo | |
| import os | |
| if not os.path.exists('datasets_clone'): | |
| Repo.clone_from('https://github.com/huggingface/datasets.git', 'datasets_clone') | |
| MAX_SIZE = 40000000000 | |
| # if len(sys.argv) > 1: | |
| # path_to_datasets = sys.argv[1] | |
| # else: | |
| # path_to_datasets = None | |
| path_to_datasets = 'datasets_clone/datasets/' | |
| ## Hack to extend the width of the main pane. | |
| def _max_width_(): | |
| max_width_str = f"max-width: 1000px;" | |
| st.markdown( | |
| f""" | |
| <style> | |
| .reportview-container .main .block-container{{ | |
| {max_width_str} | |
| }} | |
| th {{ | |
| text-align: left; | |
| font-size: 110%; | |
| }} | |
| tr:hover {{ | |
| background-color: #ffff99; | |
| }} | |
| </style> | |
| """, | |
| unsafe_allow_html=True, | |
| ) | |
| _max_width_() | |
| def render_features(features): | |
| if isinstance(features, dict): | |
| return {k: render_features(v) for k, v in features.items()} | |
| if isinstance(features, datasets.features.ClassLabel): | |
| return features.names | |
| if isinstance(features, datasets.features.Value): | |
| return features.dtype | |
| if isinstance(features, datasets.features.Sequence): | |
| return {"[]": render_features(features.feature)} | |
| return features | |
| app_state = st.experimental_get_query_params() | |
| # print(app_state) | |
| start = True | |
| loaded = True | |
| INITIAL_SELECTION = "" | |
| # if app_state == "NOT_INITIALIZED": | |
| # latest_iteration = st.empty() | |
| # bar = st.progress(0) | |
| # start = False | |
| # for i in range(0, 101, 10): | |
| # # Update the progress bar with each iteration. | |
| # # latest_iteration.text(f'Iteration {i+1}') | |
| # bar.progress(i) | |
| # time.sleep(0.1) | |
| # if i == 100: | |
| # start = True | |
| # bar.empty() | |
| # loaded = True | |
| # app_state = st.experimental_get_query_params() | |
| # print("appstate is", app_state) | |
| app_state.setdefault("dataset", "glue") | |
| if len(app_state.get("dataset", [])) == 1: | |
| app_state["dataset"] = app_state["dataset"][0] | |
| INITIAL_SELECTION = app_state["dataset"] | |
| if len(app_state.get("config", [])) == 1: | |
| app_state["config"] = app_state["config"][0] | |
| print(INITIAL_SELECTION) | |
| if start: | |
| ## Logo and sidebar decoration. | |
| st.sidebar.markdown( | |
| """<center> | |
| <a href="https://github.com/huggingface/datasets"> | |
| </a> | |
| </center>""", | |
| unsafe_allow_html=True, | |
| ) | |
| st.sidebar.image("datasets_logo_name.png", width=300) | |
| st.sidebar.markdown( | |
| "<center><h2><a href='https://github.com/huggingface/datasets'>github/huggingface/datasets</h2></a></center>", | |
| unsafe_allow_html=True, | |
| ) | |
| st.sidebar.markdown( | |
| """ | |
| <center> | |
| <a target="_blank" href="https://huggingface.co/docs/datasets/">Docs</a> | | |
| <a target="_blank" href="https://huggingface.co/datasets">Browse</a> | |
| | <a href="https://huggingface.co/new-dataset" target="_blank">Add Dataset</a> | |
| </center>""", | |
| unsafe_allow_html=True, | |
| ) | |
| st.sidebar.subheader("") | |
| ## Interaction with the datasets libary. | |
| # @st.cache | |
| def get_confs(opt): | |
| "Get the list of confs for a dataset." | |
| if path_to_datasets is not None and opt is not None: | |
| path = path_to_datasets + opt | |
| else: | |
| path = opt | |
| module_path = datasets.load.prepare_module(path, dataset=True | |
| ) | |
| # Get dataset builder class from the processing script | |
| builder_cls = datasets.load.import_main_class(module_path[0], dataset=True) | |
| # Instantiate the dataset builder | |
| confs = builder_cls.BUILDER_CONFIGS | |
| if confs and len(confs) > 1: | |
| return confs | |
| else: | |
| return [] | |
| # @st.cache(allow_output_mutation=True) | |
| def get(opt, conf=None): | |
| "Get a dataset from name and conf" | |
| if path_to_datasets is not None: | |
| path = path_to_datasets + opt | |
| else: | |
| path = opt | |
| module_path = datasets.load.prepare_module(path, dataset=True) | |
| builder_cls = datasets.load.import_main_class(module_path[0], dataset=True) | |
| if conf: | |
| builder_instance = builder_cls(name=conf, cache_dir=path if path_to_datasets is not None else None) | |
| else: | |
| builder_instance = builder_cls(cache_dir=path if path_to_datasets is not None else None) | |
| fail = False | |
| if path_to_datasets is not None: | |
| dts = datasets.load_dataset(path, | |
| name=builder_cls.BUILDER_CONFIGS[0].name if builder_cls.BUILDER_CONFIGS else None, | |
| ) | |
| dataset = dts | |
| elif ( | |
| builder_instance.manual_download_instructions is None | |
| and builder_instance.info.size_in_bytes is not None | |
| and builder_instance.info.size_in_bytes < MAX_SIZE): | |
| builder_instance.download_and_prepare() | |
| dts = builder_instance.as_dataset() | |
| dataset = dts | |
| else: | |
| dataset = builder_instance | |
| fail = True | |
| return dataset, fail | |
| # Dataset select box. | |
| dataset_names = [] | |
| selection = None | |
| import glob | |
| if path_to_datasets is None: | |
| list_of_datasets = datasets.list_datasets(with_community_datasets=False) | |
| else: | |
| list_of_datasets = sorted(glob.glob(path_to_datasets + "*")) | |
| print(list_of_datasets) | |
| for i, dataset in enumerate(list_of_datasets): | |
| dataset = dataset.split("/")[-1] | |
| if INITIAL_SELECTION and dataset == INITIAL_SELECTION: | |
| selection = i | |
| dataset_names.append(dataset ) | |
| if selection is not None: | |
| option = st.sidebar.selectbox( | |
| "Dataset", dataset_names, index=selection, format_func=lambda a: a | |
| ) | |
| else: | |
| option = st.sidebar.selectbox("Dataset", dataset_names, format_func=lambda a: a) | |
| print(option) | |
| app_state["dataset"] = option | |
| st.experimental_set_query_params(**app_state) | |
| # Side bar Configurations. | |
| configs = get_confs(option) | |
| conf_avail = len(configs) > 0 | |
| conf_option = None | |
| if conf_avail: | |
| start = 0 | |
| for i, conf in enumerate(configs): | |
| if conf.name == app_state.get("config", None): | |
| start = i | |
| conf_option = st.sidebar.selectbox( | |
| "Subset", configs, index=start, format_func=lambda a: a.name | |
| ) | |
| app_state["config"] = conf_option.name | |
| else: | |
| if "config" in app_state: | |
| del app_state["config"] | |
| st.experimental_set_query_params(**app_state) | |
| dts, fail = get(str(option), str(conf_option.name) if conf_option else None) | |
| # Main panel setup. | |
| if fail: | |
| st.markdown( | |
| "Dataset is too large to browse or requires manual download. Check it out in the datasets library! \n\n Size: " | |
| + str(dts.info.size_in_bytes) | |
| + "\n\n Instructions: " | |
| + str(dts.manual_download_instructions) | |
| ) | |
| else: | |
| k = list(dts.keys()) | |
| index = 0 | |
| if "train" in dts.keys(): | |
| index = k.index("train") | |
| split = st.sidebar.selectbox("Split", k, index=index) | |
| d = dts[split] | |
| keys = list(d[0].keys()) | |
| st.header( | |
| "Dataset: " | |
| + option | |
| + " " | |
| + (("/ " + conf_option.name) if conf_option else "") | |
| ) | |
| st.markdown( | |
| "*Homepage*: " | |
| + d.info.homepage | |
| + "\n\n*Dataset*: https://huggingface.co/datasets/%s" | |
| % (option) | |
| ) | |
| md = """ | |
| %s | |
| """ % ( | |
| d.info.description.replace("\\", "") if option else "" | |
| ) | |
| st.markdown(md) | |
| step = 50 | |
| offset = st.sidebar.number_input( | |
| "Offset (Size: %d)" % len(d), | |
| min_value=0, | |
| max_value=int(len(d)) - step, | |
| value=0, | |
| step=step, | |
| ) | |
| image_classification, gallary = False, False | |
| if d.info.task_templates: | |
| for task_template in d.info.task_templates: | |
| if task_template.task == 'image-classification': | |
| image_classification = True | |
| st.sidebar.markdown('\n---\n') | |
| gallary = st.sidebar.checkbox("Show Image Gallary 🖼️", False) if image_classification else None | |
| break | |
| citation = st.sidebar.checkbox("Show Citations 📎", False) | |
| table = image_classification or not st.sidebar.checkbox("Show List View 📋", False) | |
| show_features = st.sidebar.checkbox("Show Features 🧐", True) | |
| md = """ | |
| ``` | |
| %s | |
| ``` | |
| """ % ( | |
| d.info.citation.replace("\\", "").replace("}", " }").replace("{", "{ "), | |
| ) | |
| if citation: | |
| st.markdown(md) | |
| # st.text("Features:") | |
| if show_features: | |
| if not gallary: | |
| on_keys = st.multiselect("Features", keys, keys) | |
| st.write(render_features(d.features)) | |
| else: | |
| on_keys = keys | |
| if not table and not (image_classification and gallary): | |
| # Full view. | |
| for item in range(offset, offset + step): | |
| st.text(" ") | |
| st.text(" ---- #" + str(item)) | |
| st.text(" ") | |
| # Use st to write out. | |
| for k in on_keys: | |
| v = d[item][k] | |
| st.subheader(k) | |
| if isinstance(v, str): | |
| out = v | |
| st.text(textwrap.fill(out, width=120)) | |
| elif ( | |
| isinstance(v, bool) | |
| or isinstance(v, int) | |
| or isinstance(v, float) | |
| ): | |
| st.text(v) | |
| else: | |
| st.write(v) | |
| elif image_classification and gallary: | |
| # Image Gallary View. | |
| d = d.prepare_for_task('image-classification') | |
| n_cols, n_rows = 5, 10 | |
| images = [] | |
| labels = [] | |
| for item in range(offset, offset+step): | |
| image = Image.open(d[item]['image_file_path']).convert("RGB") | |
| images.append(image) | |
| label_id = d[item]['labels'] | |
| label_str = d.features['labels'].int2str(label_id) | |
| labels.append(f"#{item} | {label_str}") | |
| n_rows = 1 + len(images) // int(n_cols) | |
| cols_per_row = [st.beta_columns(n_cols) for _ in range(n_rows)] | |
| cols = [column for row in cols_per_row for column in row] | |
| for idx, (image, label) in enumerate(zip(images, labels)): | |
| cols[idx].image(image, caption=label) | |
| else: | |
| # Table view. Use Pandas. | |
| df = [] | |
| for item in range(offset, offset + step): | |
| df_item = {} | |
| df_item["_number"] = item | |
| for k in on_keys: | |
| v = d[item][k] | |
| if isinstance(v, str): | |
| out = v | |
| df_item[k] = textwrap.fill(out, width=50) | |
| elif ( | |
| isinstance(v, bool) | |
| or isinstance(v, int) | |
| or isinstance(v, float) | |
| ): | |
| df_item[k] = v | |
| else: | |
| out = json.dumps(v, indent=2, sort_keys=True) | |
| df_item[k] = out | |
| df.append(df_item) | |
| df2 = df | |
| df = pd.DataFrame(df).set_index("_number") | |
| def hover(hover_color="#ffff99"): | |
| return dict( | |
| selector="tr:hover", | |
| props=[("background-color", "%s" % hover_color)], | |
| ) | |
| styles = [ | |
| hover(), | |
| dict( | |
| selector="th", | |
| props=[("font-size", "150%"), ("text-align", "center")], | |
| ), | |
| dict(selector="caption", props=[("caption-side", "bottom")]), | |
| ] | |
| # Table view. Use pands styling. | |
| style = df.style.set_properties( | |
| **{"text-align": "left", "white-space": "pre"} | |
| ).set_table_styles([dict(selector="th", props=[("text-align", "left")])]) | |
| style = style.set_table_styles(styles) | |
| st.table(style) | |
| # Additional dataset installation and sidebar properties. | |
| md = """ | |
| ### Code | |
| ```python | |
| !pip install datasets | |
| from datasets import load_dataset | |
| dataset = load_dataset( | |
| '%s'%s) | |
| ``` | |
| """ % ( | |
| option, | |
| (", '" + conf_option.name + "'") if conf_option else "", | |
| ) | |
| st.sidebar.markdown(md) | |