Skip to content

Commit b629b7b

Browse files
authored
Improve memory usage when reading dataset
Parse the JSON only after splitting the dataset, so the deserialized objects are never all held in memory at once.
1 parent af04b4c commit b629b7b

1 file changed

Lines changed: 5 additions & 9 deletions

File tree

Python150kExtractor/extract.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,9 @@
2525

2626

2727
def __collect_asts(json_file):
28-
asts = []
2928
with open(json_file, 'r', encoding='utf-8') as f:
30-
for line in f:
31-
ast = json.loads(line.strip())
32-
asts.append(ast)
33-
34-
return asts
29+
for line in tqdm.tqdm(f):
30+
yield line
3531

3632

3733
def __terminals(ast, node_index, args):
@@ -170,8 +166,8 @@ def main():
170166
np.random.seed(args.seed)
171167

172168
data_dir = Path(args.data_dir)
173-
trains = __collect_asts(data_dir / 'python100k_train.json')
174-
evals = __collect_asts(data_dir / 'python50k_eval.json')
169+
trains = list(__collect_asts(data_dir / 'python100k_train.json'))
170+
evals = list(__collect_asts(data_dir / 'python50k_eval.json'))
175171

176172
train, valid = sklearn_model_selection.train_test_split(
177173
trains,
@@ -186,7 +182,7 @@ def main():
186182
(train, valid, test),
187183
):
188184
output_file = output_dir / f'{split_name}_output_file.txt'
189-
__collect_all_and_save(split, args, output_file)
185+
__collect_all_and_save((json.loads(line) for line in split), args, output_file)
190186

191187

192188
if __name__ == '__main__':

0 commit comments

Comments (0)