#!/usr/bin/env python3
"""
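teiParts2json.py

Convert TEI manuscript descriptions to JSON. A file that contains msPart
elements yields one record per part; any other file yields a single record.

Usage:
    # single file, no output flags -> print the JSON record(s) to stdout
    python teiParts2json.py input.xml
    # or, if the data directory sits next to the script's folder:
    python britishLibrary/teiParts2json.py britishLibrary-data/data/tei/10.xml

    # file or directory -> write one JSON file per record into --outdir
    # (<basename>.json, or <basename>-partN.json when the file has several parts)
    python britishLibrary/teiParts2json.py britishLibrary-data/data/tei --outdir json_output

    # file or directory -> additionally write an OpenSearch bulk file
    python britishLibrary/teiParts2json.py britishLibrary-data/data/tei --outdir json_output --bulk bulk_data.json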
"""
from lxml import etree
import argparse
import json
import os
from pathlib import Path
from typing import List
# Namespace prefix map passed to every XPath query in this module.
NS = {"tei": "http://www.tei-c.org/ns/1.0"}

# Display names for the language/script tags looked up by
# map_script_to_language(). Keys are a mix of ISO 639-1/639-3 language codes,
# language-script tags and private-use tags; lookups are exact and
# case-sensitive, and unknown tags are passed through unchanged by the caller.
SCRIPT_LANG_MAP = {
    "syr": "Syriac",
    "syr-Syre": "Syriac (Estrangela)",
    "syr-Syrj": "Syriac (Western)",
    "syr-Syrn": "Syriac (Eastern)",
    "ar": "Arabic",
    "grc": "Greek",
    "he": "Hebrew",
    "en": "English",
    "la": "Latin",
    "mul": "Multiple languages",
    "cop": "Coptic",
    "fr": "French",
    "hy": "Armenian",
    "zh-hant": "Chinese (Traditional)",
    # NOTE(review): "hyr" is not a registered ISO 639 code; presumably a
    # variant spelling of "hy" found in the data - confirm against the corpus.
    "hyr": "Armenian",
    # NOTE(review): private-use tag; confirm that this label is the intended one.
    "qhy-x-cpas": "Classical Syriac (ܟܬܒܢܝܐ)",
    # ISO 639-3 "xcl" is Classical Armenian (Lycian is "xlc").
    "xcl": "Classical Armenian",
    "und": "Undetermined",
    "syr-x-syrm": "Syriac (Melkite script)",
    "ar-syr": "Arabic language written in Syriac script",
}

# Display names for supportDesc/@material values. Lookups are exact, so the
# capitalised "Vellum" key only matches that exact spelling.
MATERIALS_MAP = {
    "perg": "Parchment",
    "chart": "Paper",
    "mixed": "Mixed",
    "unknown": "Unknown",
    "Vellum": "Parchment",
}
def map_script_to_language(script_codes):
    """Translate a whitespace-separated string of script tags into display names.

    Each tag is looked up in SCRIPT_LANG_MAP; a tag without an entry is kept
    as-is. Repeated names are emitted once, in order of first appearance.

    Returns the names joined with ", ", or None when the input is empty or
    contains no tags.
    """
    if not script_codes:
        return None
    # dict.fromkeys gives an insertion-ordered "set" of display names.
    names = dict.fromkeys(
        SCRIPT_LANG_MAP.get(tag, tag) for tag in script_codes.split()
    )
    if not names:
        return None
    return ", ".join(names)
def text_list(root, xpath) -> List[str]:
    """Evaluate xpath against root and return the non-empty text of each match.

    Element matches contribute all of their descendant text; any other match
    (for example an attribute value) is converted with str(). Runs of
    whitespace are collapsed to single spaces and blank results are dropped.
    """
    collected = []
    for match in root.xpath(xpath, namespaces=NS):
        if isinstance(match, etree._Element):
            raw = "".join(match.itertext())
        else:
            raw = str(match)
        squeezed = " ".join(raw.split())
        if squeezed:
            collected.append(squeezed)
    return collected
def html_fragment(node):
    """Serialize an element's inner content, preserving inline markup (like <span>).

    The element's own leading text is followed by each child serialized with
    etree.tostring(..., method="html"). Only the outer edges of the assembled
    fragment are trimmed, so whitespace separating the leading text from the
    first child element is kept (stripping the leading text on its own would
    glue "some text <span>" into "some text<span>").

    Returns "" when node is None or has no content.
    """
    if node is None:
        return ""
    parts = [node.text or ""]
    parts.extend(
        etree.tostring(child, encoding="unicode", method="html")
        for child in node.iterchildren()
    )
    return "".join(parts).strip()
def first_text(root, xpath):
    """Return the first entry text_list() yields for xpath, or None if there is none."""
    return next(iter(text_list(root, xpath)), None)
def _fragment_or_text(el):
    """Return an element's inner markup, falling back to its stripped leading text."""
    return html_fragment(el) or (el.text or "").strip()


def _collect_fragments(root, xpath):
    """Return the non-empty inner-markup strings of every element matched by xpath."""
    matches = root.xpath(xpath, namespaces=NS)
    return [frag for frag in map(_fragment_or_text, matches) if frag]


def _clean_value(value):
    """Collapse whitespace in a string, or in each item of a list while de-duplicating it."""
    if isinstance(value, list):
        unique = dict.fromkeys(
            " ".join(str(item).split()) for item in value if item
        )
        unique.pop("", None)
        return list(unique)
    if isinstance(value, str):
        return " ".join(value.split())
    return value


def extract_json(tree, part_node=None):
    """Extract a JSON-serializable record from a TEI tree or from one msPart.

    Args:
        tree: The full TEI document tree.
        part_node: Optional msPart element. When given, every field is read
            relative to that element; only the shelfmark falls back to the
            whole document when the part has none of its own.

    Returns:
        A dict holding only the fields that were found, plus
        "displayTitleEnglish", which is always present (possibly ""). String
        values are whitespace-normalized; list values are whitespace-normalized
        and de-duplicated in order of first appearance.
    """
    doc_root = tree.getroot()
    root = doc_root if part_node is None else part_node

    # Different title types
    title_stmt = text_list(root, ".//tei:titleStmt/tei:title")
    ms_item_titles = text_list(root, ".//tei:msItem//tei:title")
    rubrics = text_list(root, ".//tei:rubric")
    syr_titles = text_list(root, ".//tei:title[@xml:lang='syr'] | .//tei:rubric[@xml:lang='syr'] | .//tei:finalRubric[@xml:lang='syr']")

    # id: first match, in document order, among the URI idnos and any
    # msIdentifier idno.
    idno = first_text(root, ".//tei:msIdentifier/tei:idno[@type='URI'] | .//tei:publicationStmt/tei:idno[@type='URI'] | .//tei:msIdentifier/tei:idno")

    # For an msPart, the part number comes from its @n attribute.
    part_num = part_node.get("n") if part_node is not None else None

    # displayTitleEnglish: all titleStmt titles joined with spaces
    # (text_list never returns empty entries).
    display_title_english = " ".join(title_stmt)

    summary = first_text(root, ".//tei:msContents/tei:summary | .//tei:profileDesc/tei:abstract")

    # Names and places: full text of each element, nested markup flattened.
    pers_list = text_list(root, ".//tei:persName")
    place_list = text_list(root, ".//tei:placeName")
    origin_place_list = text_list(root, ".//tei:origPlace")

    # shelfmark: parts inherit the document-level shelfmark when they have none.
    shelfmark_xpath = ".//tei:altIdentifier/tei:idno[@type='BL-Shelfmark-display']"
    shelfmarks = text_list(root, shelfmark_xpath)
    if not shelfmarks and part_node is not None:
        shelfmarks = text_list(doc_root, shelfmark_xpath)

    # Markup-preserving fields. Empty fragments are dropped for all three so
    # that no field is ever emitted as an empty list.
    final_rubrics = _collect_fragments(root, ".//tei:finalRubric")
    colophons = _collect_fragments(root, ".//tei:additions//tei:list//tei:item[tei:label/text() = 'Colophon']//tei:quote")
    other_limit = _collect_fragments(root, ".//tei:additions//tei:item | .//tei:additions//tei:note")

    # script: prefer the @script codes on handNote elements. Only those codes
    # are mapped to language names; the prose fallback is free text, and
    # mapping it would turn every word into a bogus "language".
    script_codes = text_list(root, ".//tei:handNote/@script | .//tei:handDesc//tei:handNote/@script")
    if script_codes:
        script_val = " ".join(script_codes)
        script_lang = map_script_to_language(script_val)
    else:
        hand_prose = text_list(root, ".//tei:handNote | .//tei:handDesc")
        script_val = " ".join(hand_prose) if hand_prose else None
        script_lang = None

    # material: first try the @material attribute, then the material element text.
    material_attr = first_text(root, ".//tei:physDesc//tei:objectDesc//tei:supportDesc/@material")
    if material_attr:
        material = MATERIALS_MAP.get(material_attr, material_attr)
    else:
        material = first_text(root, ".//tei:physDesc//tei:objectDesc//tei:supportDesc//tei:material | .//tei:objectDesc//tei:supportDesc//tei:material | .//tei:physDesc//tei:supportDesc//tei:material | .//tei:physDesc//tei:supportDesc//tei:support//tei:material")

    # form: objectDesc/@form, capitalized.
    form = first_text(root, ".//tei:physDesc//tei:objectDesc/@form")
    if form:
        form = form.capitalize()

    # extent: text of the first measure under supportDesc/extent.
    extent = first_text(root, ".//tei:physDesc//tei:objectDesc//tei:supportDesc//tei:extent//tei:measure")

    # Wright entry number, wrapped as "[Wright N]".
    wright_num = first_text(root, ".//tei:msIdentifier//tei:altIdentifier//tei:idno[@type='Wright-BL-Roman']")
    if wright_num:
        wright_num = f"[Wright {wright_num}]"

    contents_note = first_text(root, ".//tei:head/tei:note[@type='contents-note']")

    # classification: Wright taxonomy descriptions, minus the boilerplate
    # sentences about composite manuscripts.
    boilerplate = (
        "this unit is a part of a composite manuscript",
        "this composite",
        "this manuscript",
    )
    classification = [
        c
        for c in text_list(root, ".//tei:head/tei:listRelation[@type='Wright-BL-Taxonomy']/tei:relation/tei:desc")
        if not c.lower().startswith(boilerplate)
    ]

    # date: separate fields for each date type.
    orig_dates = first_text(root, ".//tei:origDate")
    date_not_before = text_list(root, ".//tei:origDate/@notBefore | .//tei:date/@notBefore")
    date_not_after = text_list(root, ".//tei:origDate/@notAfter | .//tei:date/@notAfter")
    date_when = text_list(root, ".//tei:date/@when")
    date_calendar = text_list(root, ".//tei:origDate/@calendar | .//tei:date/@calendar")

    decorations = text_list(root, ".//tei:decoNote")
    decoration_types = text_list(root, ".//tei:decoNote/@type")

    authors = text_list(root, ".//tei:msItem//tei:author//tei:persName")
    authors_uri = text_list(root, ".//tei:msItem//tei:author/@ref")
    incipits = text_list(root, ".//tei:msItem//tei:incipit")
    explicits = text_list(root, ".//tei:msItem//tei:explicit")

    # Output fields in their fixed emission order (dict order becomes JSON
    # key order).
    fields = [
        ("titleStmt", title_stmt),
        ("msItemTitle", ms_item_titles),
        ("rubric", rubrics),
        ("syrTitle", syr_titles),
        ("idno", idno),
        ("partNum", part_num),
        ("displayTitleEnglish", display_title_english),
        ("summary", summary),
        ("persName", pers_list),
        ("placeName", place_list),
        ("origPlace", origin_place_list),
        ("shelfmark", shelfmarks),
        ("finalRubrics", final_rubrics),
        ("colophons", colophons),
        ("otherLimit", other_limit),
        ("script", script_val),
        ("scriptLanguage", script_lang),
        ("material", material),
        ("classification", classification),
        ("origDate", orig_dates),
        ("dateNotBefore", date_not_before),
        ("dateNotAfter", date_not_after),
        ("dateWhen", date_when),
        ("dateCalendar", date_calendar),
        ("decorations", decorations),
        ("decorationsType", decoration_types),
        ("author", authors),
        ("authorUri", authors_uri),
        ("incipit", incipits),
        ("explicit", explicits),
        ("form", form),
        ("extent", extent),
        ("wrightNum", wright_num),
        ("contentsNote", contents_note),
    ]
    # displayTitleEnglish is the one field emitted even when empty.
    always_present = {"displayTitleEnglish"}
    return {
        key: _clean_value(value)
        for key, value in fields
        if value or key in always_present
    }
def process_file(path: Path):
    """Parse one TEI file and return its JSON records as a list.

    The file is parsed with a recovering parser that drops blank text. If the
    document contains msPart elements, one record is produced per part;
    otherwise a single record is produced for the whole document.
    """
    tolerant_parser = etree.XMLParser(recover=True, remove_blank_text=True)
    document = etree.parse(str(path), parser=tolerant_parser)
    parts = document.getroot().xpath(".//tei:msPart", namespaces=NS)
    if not parts:
        # No parts: the whole document is the record.
        return [extract_json(document)]
    return [extract_json(document, part_node=unit) for unit in parts]
def main():
    """Command-line entry point: convert a TEI file or a directory of *.xml files.

    Depending on the flags, each record is written as its own JSON file
    (--outdir), appended to an OpenSearch bulk file (--bulk) and/or collected
    into a single JSON array (--manuscripts). A single input file with no
    output flag is printed to stdout instead.

    Files that fail to parse are reported on stderr and skipped, so that the
    message never mixes with JSON on stdout. In single-file stdout mode a
    parse failure exits with status 1.

    Raises:
        SystemExit: if the path does not exist, or on a parse failure in
            single-file stdout mode.
    """
    import sys  # local import: used only to route error messages to stderr

    ap = argparse.ArgumentParser()
    ap.add_argument("path", help="TEI XML file or directory")
    ap.add_argument("--outdir", "-o", help="directory to write per-file JSON outputs")
    ap.add_argument("--bulk", help="write an OpenSearch bulk file (newline-delimited index JSON + doc JSON)")
    ap.add_argument("--manuscripts", help="write a manuscripts.json array file for web UI")
    ap.add_argument("--index", default="britishlibrary-index-1", help="index name for bulk")
    ap.add_argument("--idprefix", default="ms", help="prefix for _id in bulk (e.g., ms)")
    args = ap.parse_args()

    p = Path(args.path)
    if p.is_dir():
        targets = sorted(p.glob("*.xml"))
    elif p.is_file():
        targets = [p]
    else:
        raise SystemExit("Path not found")

    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)

    # Single file and no output flag: print the records instead of writing files.
    stdout_mode = p.is_file() and not (args.outdir or args.bulk or args.manuscripts)

    manuscripts_list = []
    bulk_writer = open(args.bulk, "w", encoding="utf8") if args.bulk else None
    try:
        for f in targets:
            try:
                records = process_file(f)
            except Exception as e:
                print(f"ERROR parsing {f}: {e}", file=sys.stderr)
                if stdout_mode:
                    raise SystemExit(1) from e
                continue

            fname = f.stem
            # A file with msParts yields several records; give each a unique id.
            for idx, j in enumerate(records):
                if len(records) > 1:
                    record_id = f"{fname}-part{idx+1}"
                else:
                    record_id = fname

                if stdout_mode:
                    print(json.dumps(j, ensure_ascii=False, indent=2))

                # if outdir requested, write each JSON
                if args.outdir:
                    outp = Path(args.outdir) / (record_id + ".json")
                    with open(outp, "w", encoding="utf8") as fh:
                        json.dump(j, fh, ensure_ascii=False, indent=2)
                    print(f"Wrote {outp}")

                # if bulk requested, write the two-line bulk entry
                if bulk_writer:
                    meta = {"index": {"_index": args.index, "_id": f"{args.idprefix}-{record_id}"}}
                    bulk_writer.write(json.dumps(meta, ensure_ascii=False) + "\n")
                    bulk_writer.write(json.dumps(j, ensure_ascii=False) + "\n")

                # Collect for the manuscripts array. This runs after the
                # per-file and bulk writes, so only the array entries carry "id".
                if args.manuscripts:
                    j["id"] = f"{args.idprefix}-{record_id}"
                    for key, value in j.items():
                        if isinstance(value, list):
                            # order-preserving de-duplication, dropping empties
                            j[key] = list(dict.fromkeys(item for item in value if item))
                    # Remove composite manuscript classification
                    if isinstance(j.get("classification"), list):
                        j["classification"] = [
                            c
                            for c in j["classification"]
                            if not (
                                isinstance(c, str)
                                and c.startswith("This unit is a part of a composite manuscript")
                            )
                        ]
                    manuscripts_list.append(j)
    finally:
        # Close the bulk file even when a write fails part-way through.
        if bulk_writer:
            bulk_writer.close()

    if bulk_writer:
        print(f"Wrote bulk file {args.bulk}")

    if args.manuscripts:
        with open(args.manuscripts, "w", encoding="utf8") as fh:
            json.dump(manuscripts_list, fh, ensure_ascii=False, indent=2)
        print(f"Wrote manuscripts file {args.manuscripts}")


if __name__ == "__main__":
    main()