Spaces:
Running
on
Zero
Running
on
Zero
Update iterate_data (#81)
Browse files
Summary:
Test Plan:
- bytelatent/iterate_data.py +15 -6
bytelatent/iterate_data.py
CHANGED
|
@@ -8,10 +8,16 @@ from bytelatent.data.iterators.multiprocess_iterator import MultiprocessIterator
|
|
| 8 |
from bytelatent.logger import init_logger
|
| 9 |
|
| 10 |
|
| 11 |
-
def main(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
init_logger()
|
| 13 |
-
pyarrow.set_io_thread_count(
|
| 14 |
-
pyarrow.set_cpu_count(
|
| 15 |
with open(state_file) as f:
|
| 16 |
train_state = json.load(f)
|
| 17 |
dl_state = MultiprocessIteratorState(**train_state["data_loader_state"])
|
|
@@ -20,10 +26,13 @@ def main(state_file: str):
|
|
| 20 |
packing_iterator = packing_iterator_state.build()
|
| 21 |
print("iter")
|
| 22 |
batch_iter = packing_iterator.create_iter()
|
| 23 |
-
batch = None
|
| 24 |
print("looping")
|
| 25 |
-
for i in track(range(
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
if __name__ == "__main__":
|
|
|
|
| 8 |
from bytelatent.logger import init_logger
|
| 9 |
|
| 10 |
|
| 11 |
+
def main(
|
| 12 |
+
state_file: str,
|
| 13 |
+
steps: int = 3_000,
|
| 14 |
+
io_thread_count: int = 2,
|
| 15 |
+
cpu_count: int = 2,
|
| 16 |
+
log_freq: int = 100,
|
| 17 |
+
):
|
| 18 |
init_logger()
|
| 19 |
+
pyarrow.set_io_thread_count(io_thread_count)
|
| 20 |
+
pyarrow.set_cpu_count(cpu_count)
|
| 21 |
with open(state_file) as f:
|
| 22 |
train_state = json.load(f)
|
| 23 |
dl_state = MultiprocessIteratorState(**train_state["data_loader_state"])
|
|
|
|
| 26 |
packing_iterator = packing_iterator_state.build()
|
| 27 |
print("iter")
|
| 28 |
batch_iter = packing_iterator.create_iter()
|
|
|
|
| 29 |
print("looping")
|
| 30 |
+
for i in track(range(steps)):
|
| 31 |
+
_ = next(batch_iter)
|
| 32 |
+
if i % log_freq == 0:
|
| 33 |
+
print(pyarrow.default_memory_pool())
|
| 34 |
+
print(i)
|
| 35 |
+
print(pyarrow.default_memory_pool())
|
| 36 |
|
| 37 |
|
| 38 |
if __name__ == "__main__":
|