
Commit 098f66b

Extended logs

1 parent 039bf87 commit 098f66b

1 file changed
Lines changed: 37 additions & 18 deletions

File: dvuploader/dvuploader.py
@@ -2,7 +2,7 @@
 import requests
 import json
 import os
-from typing import List
+from typing import Dict, List
 from urllib.parse import urljoin

 from pydantic import BaseModel
@@ -33,6 +33,7 @@ def upload(
         persistent_id: str,
         dataverse_url: str,
         api_token: str,
+        n_jobs: int = -1,
     ) -> None:
         """
         Uploads the files to the specified Dataverse repository in parallel.
@@ -41,7 +42,8 @@ def upload(
             persistent_id (str): The persistent identifier of the Dataverse dataset.
             dataverse_url (str): The URL of the Dataverse repository.
             api_token (str): The API token for the Dataverse repository.
-
+            n_jobs (int): The number of parallel jobs to run. Defaults to -1.
+
         Returns:
             None
         """
@@ -59,13 +61,13 @@ def upload(
             )

         if not self.files:
-            print("❌ No files to upload")
+            print("\n❌ No files to upload")
             return

         # Upload files in parallel
         print(f"\n🚀 Uploading files")

-        Parallel(n_jobs=-1, backend="threading")(
+        Parallel(n_jobs=n_jobs, backend="threading")(
             delayed(direct_upload)(
                 file=file,
                 dataverse_url=dataverse_url,
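
For context on the new n_jobs parameter: it is passed straight through to joblib's Parallel, where -1 means "use all available workers" and backend="threading" keeps the uploads in a single process, which suits I/O-bound HTTP requests. Below is a minimal, self-contained sketch of that joblib pattern; the _upload_one helper is a hypothetical stand-in for direct_upload and not part of the package.

from joblib import Parallel, delayed

def _upload_one(path: str) -> str:
    # Hypothetical stand-in for direct_upload(file=..., dataverse_url=..., api_token=...)
    return f"uploaded {path}"

# n_jobs controls the number of parallel workers (-1 = all available);
# backend="threading" shares one process, appropriate for I/O-bound work.
results = Parallel(n_jobs=4, backend="threading")(
    delayed(_upload_one)(path) for path in ["a.csv", "b.csv", "c.csv"]
)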
@@ -101,23 +103,40 @@ def _check_duplicates(
             api_token=api_token,
         )

-        print("🔎 Checking dataset files")
-
-        for file in ds_files:
-            hash_algo, hash_value = tuple(file.dataFile.checksum.values())
-            fname = file.dataFile.filename
-
-            same_hash = lambda file: (
-                file.checksum.value == hash_value and file.checksum.type == hash_algo
-            )
-
-            if any(map(same_hash, self.files)):
-                del self.files[self.files.index(next(filter(same_hash, self.files)))]
+        print("\n🔎 Checking dataset files")
+
+        to_remove = []
+
+        for file in self.files:
+            if any(map(lambda dsFile: self._check_hashes(file, dsFile), ds_files)):
                 print(
-                    f"├── File '{fname}' already exists with same {hash_algo} hash - Skipping upload."
+                    f"├── File '{file.fileName}' already exists with same {file.checksum.type} hash - Skipping upload."
                 )
+                to_remove.append(file)
+            else:
+                print(f"├── File '{file.fileName}' is new - Uploading.")
+
+        for file in to_remove:
+            self.files.remove(file)
+
+        print("🎉 Done")
+
+    @staticmethod
+    def _check_hashes(file: File, dsFile: Dict):
+        """
+        Checks if a file has the same checksum as a file in the dataset.
+
+        Parameters:
+            file (File): The file to check.
+            dsFile (Dict): The file in the dataset to compare to.
+
+        Returns:
+            bool: True if the files have the same checksum, False otherwise.
+        """
+
+        hash_algo, hash_value = tuple(dsFile.dataFile.checksum.values())

-        print("🎉 Done\n")
+        return file.checksum.value == hash_value and file.checksum.type == hash_algo

     @staticmethod
     def _retrieve_dataset_files(
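
The refactored duplicate check boils down to comparing a local file's checksum type and value against each entry returned for the dataset. A minimal sketch of that comparison follows, using a dataclass stand-in for the File model and a plain dict for the dataset entry; both are assumptions for illustration, since in the package the entry is whatever _retrieve_dataset_files returns and is accessed via attributes rather than keys.

from dataclasses import dataclass

@dataclass
class Checksum:
    type: str
    value: str

@dataclass
class LocalFile:  # hypothetical stand-in for dvuploader's File model
    fileName: str
    checksum: Checksum

def check_hashes(file: LocalFile, ds_file: dict) -> bool:
    # Mirrors _check_hashes: the dataset entry carries {"checksum": {"type": ..., "value": ...}};
    # like the original, this relies on the checksum mapping's insertion order (type, then value).
    hash_algo, hash_value = tuple(ds_file["dataFile"]["checksum"].values())
    return file.checksum.value == hash_value and file.checksum.type == hash_algo

local = LocalFile("results.csv", Checksum("MD5", "d41d8cd98f00b204e9800998ecf8427e"))
remote = {"dataFile": {"checksum": {"type": "MD5", "value": "d41d8cd98f00b204e9800998ecf8427e"}}}
assert check_hashes(local, remote)  # same algorithm and digest, so the upload is skipped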
