From fd6710ca0a43454b7dd26f552758d037c686ff16 Mon Sep 17 00:00:00 2001 From: Joshua Huelsman Date: Wed, 27 May 2026 14:04:21 -0400 Subject: [PATCH 1/5] Add SenseExample SQL table type. --- jamdict/data/setup_jmdict.sql | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/jamdict/data/setup_jmdict.sql b/jamdict/data/setup_jmdict.sql index 94c4427..315984f 100644 --- a/jamdict/data/setup_jmdict.sql +++ b/jamdict/data/setup_jmdict.sql @@ -182,6 +182,14 @@ CREATE TABLE SenseGloss ( ,FOREIGN KEY (sid) REFERENCES Sense(id) ); +CREATE TABLE SenseExample ( + sid INTEGER + ,text TEXT + ,sentence_jpn + ,sentence_eng + ,FOREIGN KEY (sid) REFERENCES Sense(id) +); + ------------------------------------------------------------------------------------- -- INDICES - JMDict ------------------------------------------------------------------------------------- @@ -231,3 +239,7 @@ CREATE INDEX SenseGloss_sid ON SenseGloss(sid); CREATE INDEX SenseGloss_lang ON SenseGloss(lang); CREATE INDEX SenseGloss_gend ON SenseGloss(gend); CREATE INDEX SenseGloss_text ON SenseGloss(text); +CREATE INDEX SenseExample_sid ON SenseExample(sid); +CREATE INDEX SenseExample_text ON SenseExample(text); +CREATE INDEX SenseExample_sentence_jpn ON SenseExample(sentence_jpn); +CREATE INDEX SenseExample_sentence_eng ON SenseExample(sentence_eng); From f0c322ca6623868b786a9c5bd685f17251382958 Mon Sep 17 00:00:00 2001 From: Joshua Huelsman Date: Wed, 27 May 2026 14:07:14 -0400 Subject: [PATCH 2/5] Add SenseExample class and parsing. --- jamdict/jmdict.py | 57 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index a2232ce..3286b40 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -410,8 +410,7 @@ def __init__(self): target-language phrases or sentences which exemplify the usage of the Japanese head-word and the target-language gloss. Words in example fields would typically not be indexed by a dictionary application.''' - # It seems that this field is not used anymore! - self.examples = [] # + self.examples: List[SenseExample] = [] # def __repr__(self): return str(self) @@ -455,6 +454,8 @@ def to_dict(self): sd['dialect'] = self.dialect if self.gloss: sd['SenseGloss'] = [x.to_dict() for x in self.gloss] + if self.examples: + sd['examples'] = self.examples return sd @@ -541,6 +542,38 @@ def to_dict(self): return gd +class SenseExample(object): + '''The example elements contain a Japanese sentence using the term + associated with the entry, and one or more translations of that sentence. + Within the element, the ex_srce element will indicate the source of the + sentences (typically the sequence number in the Tatoeba Project), the + ex_text element will contain the form of the term in the Japanese + sentence, and the ex_sent elements contain the example sentences.''' + def __init__(self, text, sentences): + self.text = text + self.sentences = sentences + + def __str__(self): + tmp = [self.text] + + if len(self.sentences) > 0: + tmp.append(self.sentences['jpn']) + + for lang,sent in self.sentences.items(): + if lang != 'jpn': + tmp.append('(lang:%s) %s' % (lang, sent)) + + return ' - '.join(tmp) + + def to_dict(self): + gd = {} + if self.text: + gd['text'] = self.text + if self.sentences: + gd['sentences'] = self.sentences + return gd + + class LSource: '''This element records the information about the source language(s) of a loan-word/gairaigo. If the source language is other @@ -780,7 +813,7 @@ def parse_sense(self, sense_tag, entry): elif child.tag == 'dial': sense.dialect.append(child.text) elif child.tag == 'example': - sense.examples.append(child.text) + self.parse_example(child, sense) elif child.tag == 'lsource': self.parse_lsource(child, sense) elif child.tag == 'gloss': @@ -798,6 +831,24 @@ def get_attrib(self, a_tag, attr_name, default_value=''): else: return default_value + def parse_example(self, example_tag, sense): + srce = '' + text = '' + sentences = {} + + for child in example_tag: + if child.tag == 'ex_srce': + srce = child.text + elif child.tag == 'ex_text': + text = child.text + elif child.tag == 'ex_sent': + lang = self.get_attrib(child, 'xml:lang') + sentences[lang] = child.text + + example = SenseExample(text, sentences) + sense.examples.append(example) + return example + def parse_sensegloss(self, gloss_tag, sense): lang = self.get_attrib(gloss_tag, 'xml:lang') gend = self.get_attrib(gloss_tag, 'g_gend') From 35c7829712da1cf03df4226871b93f0123460b1c Mon Sep 17 00:00:00 2001 From: Joshua Huelsman Date: Wed, 27 May 2026 14:08:11 -0400 Subject: [PATCH 3/5] Add SQL table encoding and decoding --- jamdict/jmdict_sqlite.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/jamdict/jmdict_sqlite.py b/jamdict/jmdict_sqlite.py index bde14c4..b25a852 100644 --- a/jamdict/jmdict_sqlite.py +++ b/jamdict/jmdict_sqlite.py @@ -13,7 +13,7 @@ from puchikarui import Schema from . import __version__ as JAMDICT_VERSION, __url__ as JAMDICT_URL -from .jmdict import Meta, JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, LSource +from .jmdict import Meta, JMDEntry, EntryInfo, Link, BibInfo, Audit, KanjiForm, KanaForm, Sense, SenseGloss, SenseExample, LSource # ------------------------------------------------------------------------------- # Configuration @@ -82,6 +82,7 @@ def __init__(self, db_path, *args, **kwargs): self.add_table('SenseSource', ['sid', 'text', 'lang', 'lstype', 'wasei']) self.add_table('dialect', ['sid', 'text']) self.add_table('SenseGloss', ['sid', 'lang', 'gend', 'text']) + self.add_table('SenseExample', ['sid', 'text', 'sentence_jpn', 'sentence_eng']) class JMDictSQLite(JMDictSchema): @@ -268,6 +269,10 @@ def get_entry(self, idseq, ctx=None): gs = ctx.SenseGloss.select('sid=?', (dbs.ID,)) for g in gs: s.gloss.append(SenseGloss(g.lang, g.gend, g.text)) + # examples + exs = ctx.SenseExample.select('sid=?', (dbs.ID,)) + for e in exs: + s.examples.append(SenseExample(e.text, {"jpn":e.sentence_jpn, "eng":e.sentence_eng})) entry.senses.append(s) return entry @@ -359,3 +364,5 @@ def insert_entry(self, entry, ctx=None): # SenseGloss for g in s.gloss: ctx.SenseGloss.insert(sid, g.lang, g.gend, g.text) + for e in s.examples: + ctx.SenseExample.insert(sid, e.text, e.sentences['jpn'], e.sentences['eng']) \ No newline at end of file From afb5cf6d152a8c450720208addc1fde59f453c37 Mon Sep 17 00:00:00 2001 From: Joshua Huelsman Date: Wed, 27 May 2026 14:08:44 -0400 Subject: [PATCH 4/5] Add printing examples using lookup.py tool. --- jamdict/tools.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/jamdict/tools.py b/jamdict/tools.py index 29ac84a..581885b 100755 --- a/jamdict/tools.py +++ b/jamdict/tools.py @@ -98,6 +98,9 @@ def dump_result(results, report=None): report.print("-" * 20) for idx, s in enumerate(e.senses): report.print("{idx}. {s}".format(idx=idx + 1, s=s)) + for idx, e in enumerate(s.examples): + report.print("\t{idx}. {e}".format(idx=idx + 1, e=e.sentences['jpn'])) + report.print("\t {e:>{spaces}}".format(spaces=len(str(idx))+len(e.sentences['eng']), e=e.sentences['eng'])) report.print('') else: report.print("No dictionary entry was found.") From 44cb191d3367860750172c83c7ad296edbaf11bf Mon Sep 17 00:00:00 2001 From: Joshua Huelsman Date: Wed, 27 May 2026 15:45:11 -0400 Subject: [PATCH 5/5] Modify some comments --- jamdict/jmdict.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/jamdict/jmdict.py b/jamdict/jmdict.py index 3286b40..1f52e7a 100644 --- a/jamdict/jmdict.py +++ b/jamdict/jmdict.py @@ -406,10 +406,6 @@ def __init__(self): self.gloss: List[SenseGloss] = [] # - '''The example elements provide for pairs of short Japanese and - target-language phrases or sentences which exemplify the usage of the - Japanese head-word and the target-language gloss. Words in example - fields would typically not be indexed by a dictionary application.''' self.examples: List[SenseExample] = [] # def __repr__(self): @@ -548,7 +544,14 @@ class SenseExample(object): Within the element, the ex_srce element will indicate the source of the sentences (typically the sequence number in the Tatoeba Project), the ex_text element will contain the form of the term in the Japanese - sentence, and the ex_sent elements contain the example sentences.''' + sentence, and the ex_sent elements contain the example sentences. + DTD: + + + + + + ''' def __init__(self, text, sentences): self.text = text self.sentences = sentences