 import requests
 import json
 import os
-from typing import List
+from typing import Dict, List
 from urllib.parse import urljoin

 from pydantic import BaseModel
@@ -33,6 +33,7 @@ def upload(
         persistent_id: str,
         dataverse_url: str,
         api_token: str,
+        n_jobs: int = -1,
     ) -> None:
         """
         Uploads the files to the specified Dataverse repository in parallel.
@@ -41,7 +42,8 @@ def upload(
             persistent_id (str): The persistent identifier of the Dataverse dataset.
             dataverse_url (str): The URL of the Dataverse repository.
             api_token (str): The API token for the Dataverse repository.
-
+            n_jobs (int): The number of parallel jobs to run. Defaults to -1.
+
         Returns:
             None
         """
@@ -59,13 +61,13 @@ def upload(
         )

         if not self.files:
-            print("❌ No files to upload")
+            print("\n❌ No files to upload")
             return

         # Upload files in parallel
         print(f"\n🚀 Uploading files")

-        Parallel(n_jobs=-1, backend="threading")(
+        Parallel(n_jobs=n_jobs, backend="threading")(
             delayed(direct_upload)(
                 file=file,
                 dataverse_url=dataverse_url,
@@ -101,23 +103,40 @@ def _check_duplicates(
             api_token=api_token,
         )

-        print("🔎 Checking dataset files")
-
-        for file in ds_files:
-            hash_algo, hash_value = tuple(file.dataFile.checksum.values())
-            fname = file.dataFile.filename
-
-            same_hash = lambda file: (
-                file.checksum.value == hash_value and file.checksum.type == hash_algo
-            )
-
-            if any(map(same_hash, self.files)):
-                del self.files[self.files.index(next(filter(same_hash, self.files)))]
+        print("\n🔎 Checking dataset files")
+
+        to_remove = []
+
+        for file in self.files:
+            if any(map(lambda dsFile: self._check_hashes(file, dsFile), ds_files)):
                 print(
-                    f"├── File '{fname}' already exists with same {hash_algo} hash - Skipping upload."
+                    f"├── File '{file.fileName}' already exists with same {file.checksum.type} hash - Skipping upload."
                 )
+                to_remove.append(file)
+            else:
+                print(f"├── File '{file.fileName}' is new - Uploading.")
+
+        for file in to_remove:
+            self.files.remove(file)
+
+        print("🎉 Done")
+
+    @staticmethod
+    def _check_hashes(file: File, dsFile: Dict):
+        """
+        Checks if a file has the same checksum as a file in the dataset.
+
+        Parameters:
+            file (File): The file to check.
+            dsFile (Dict): The file in the dataset to compare to.
+
+        Returns:
+            bool: True if the files have the same checksum, False otherwise.
+        """
+
+        hash_algo, hash_value = tuple(dsFile.dataFile.checksum.values())

-        print("🎉 Done\n")
+        return file.checksum.value == hash_value and file.checksum.type == hash_algo

     @staticmethod
     def _retrieve_dataset_files(
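
Taken together, the commit exposes upload parallelism through the new `n_jobs` parameter and refactors duplicate detection so that files whose checksums already match a dataset file are dropped before upload. A minimal usage sketch, assuming the enclosing class is called `DVUploader` and that `File` accepts a `filepath` argument; neither appears in this diff, and all identifier values below are placeholders:

from dvuploader import DVUploader, File  # assumed package layout, not shown in the diff

# Placeholder coordinates; substitute a real dataset DOI, installation URL, and token.
uploader = DVUploader(files=[File(filepath="data/results.csv")])
uploader.upload(
    persistent_id="doi:10.70122/FK2/EXAMPLE",
    dataverse_url="https://demo.dataverse.org",
    api_token="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx",
    n_jobs=4,  # new in this commit: cap the threading backend at 4 workers; -1 keeps the old use-all-cores default
)

With these changes, re-running the same call against an unchanged dataset becomes cheap: `_check_duplicates` removes every local file whose checksum type and value match a dataset file (compared in `_check_hashes`) before `Parallel` fans out the remaining uploads.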