diff --git a/bin/parseRefStrings.pl b/bin/parseRefStrings.pl index 3cfed23..2495014 100755 --- a/bin/parseRefStrings.pl +++ b/bin/parseRefStrings.pl @@ -7,8 +7,12 @@ # Min-Yen Kan (Thu Feb 28 14:10:28 SGT 2008) # Derived from citeExtract.pl # +# Matthias Bösinger (29.03.2016) +# -> all changes marked with: MB1 + use strict; use FindBin; +use Getopt::Long; use lib "$FindBin::Bin/../lib"; use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0"; @@ -16,16 +20,40 @@ use ParsCit::Controller; use CSXUtil::SafeText qw(cleanAll cleanXML); +use ParsCit::ConfigLang; + +### +# set standard encoding to UTF-8 +# MB1 +### +binmode STDIN, ":encoding(UTF-8)"; +binmode STDOUT, ":encoding(UTF-8)"; + +### Get additional parameter (language parameter (english as default), split parameter, keep temp files parameter) - MB1 +my $lang = "en"; +my $split = ''; +my $keep = ''; +if (!GetOptions("lang=s" => \$lang, "split" => \$split, "keep" => \$keep)) { + print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n"; + exit; +} +# initialize language config +if (!ParsCit::ConfigLang::Init($lang)) { + print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n"; + exit; +} +### End (additional parameter) - MB1 my $textFile = $ARGV[0]; my $outFile = $ARGV[1]; if (!defined $textFile) { - print "Usage: $0 textfile [outfile]\n"; + print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n"; # Updated - MB1 exit; } -open (IF, $textFile) || die "Couldn't open text file \"textFile\"!"; +# open (IF, $textFile) || die "Couldn't open text file \"textFile\"!"; +open (IF, "<:utf8", $textFile) || die "Couldn't open text file \"textFile\"!"; # set to utf-8-encoding - MB1 my $normalizedCiteText = ""; my $line = 0; while () { @@ -42,16 +70,16 @@ } our $msg = ""; -my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile); -my $outFile = $tmpFile."_dec"; +my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile, $split); # Additional 
parameter 'split' - MB1 +my $outTmpFile = $tmpFile."_dec"; # Changed name from '$outFile'. Otherwise name conflict with 'outFile' from $ARGV[1]; - MB1 my @validCitations = (); my $xml = ""; $xml .= "\n"; $xml .= "\n"; -if (ParsCit::Tr2crfpp::Decode($tmpFile, $outFile)) { +if (ParsCit::Tr2crfpp::Decode($tmpFile, $outTmpFile)) { my ($rRawXML, $rCiteInfo, $tstatus, $tmsg) = - ParsCit::PostProcess::ReadAndNormalize($outFile); + ParsCit::PostProcess::ReadAndNormalize($outTmpFile); if ($tstatus <= 0) { return ($tstatus, $msg, undef, undef); } @@ -60,15 +88,16 @@ my %citeInfo = %{$citeInfo[$i]}; $xml .= "\n"; foreach my $key (keys %citeInfo) { - if ($key eq "authors" || $key eq "editors") { - my $singular = $key; - chop $singular; - $xml .= "<$key>\n"; - foreach my $person (@{$citeInfo{$key}}) { - cleanAll(\$person); - $xml .= "<$singular>$person\n"; - } - $xml .= "\n"; + if ($key eq "authors" || $key eq "editors") + { + my $singular = $key; + chop $singular; + $xml .= "<$key>\n"; + foreach my $person (@{$citeInfo{$key}}) { + cleanAll(\$person); + $xml .= "<$singular>$person\n"; + } + $xml .= "\n"; } elsif ($key eq "volume") { @@ -96,7 +125,24 @@ $xml .= "\n\n"; } -unlink($tmpFile); -unlink($outFile); +### +# tmp-files are kept if parameter has been set. +# MB1 +### +unless ($keep) { + unlink($tmpFile); + unlink($outTmpFile); +} + +### +# If outFile has been passed as parameter the result .xml will be printed to this file. +# Else the result .xml will be printed to standard out. +# MB1 +### +if (open(OUT, ">:utf8", $outFile)) { + print OUT $xml; +} +else { + print $xml; +} -print $xml; diff --git a/bin/prepData.pl b/bin/prepData.pl new file mode 100755 index 0000000..8fd82cd --- /dev/null +++ b/bin/prepData.pl @@ -0,0 +1,58 @@ +#!/usr/bin/perl -CSD +# +# Script to transform reference strings to crf++ compatible data. +# By doing so, CRF++ can be used manually with the transformed reference data. 
+# +# Derived from 'parseRefStrings.pl' +# +# written by Matthias Bösinger (29.03.2016) + +use strict; +use FindBin; +use lib "$FindBin::Bin/../lib"; + +use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0"; +use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0"; + +use ParsCit::Controller; +use CSXUtil::SafeText qw(cleanAll cleanXML); + +my $textFile = $ARGV[0]; +my $outFile = $ARGV[1]; + +if (!defined $textFile || !defined $outFile) { + print "Usage: $0 textfile outfile\n"; + exit; +} + +open (IF, $textFile) || die "Couldn't open text file \"textFile\"!"; +my $normalizedCiteText = ""; +my $line = 0; +while () { + chop; + # Tr2crfpp needs an enclosing tag for initial class seed. + $normalizedCiteText .= " " . $_ . " \n"; + $line++; +} +close (IF); + +if ($line == 0) { + # Stop - nothing left to do. + exit(); +} + +our $msg = ""; +my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile); + +open (TF, $tmpFile) || die "Couldn't open tmp file!"; +open (OF, ">$outFile") || die "Couldn't open out file!"; +while () { + chop; + print OF $_ . 
"\n"; +} + +close(TF); +close(OF); + +unlink($tmpFile); + diff --git a/bin/tr2crfpp.pl b/bin/tr2crfpp.pl index c9af840..a9e3541 100755 --- a/bin/tr2crfpp.pl +++ b/bin/tr2crfpp.pl @@ -1,5 +1,9 @@ #!/usr/bin/env perl # -*- cperl -*- +# +# Matthias Bösinger (29.03.2016) +# -> all changes marked with: MB1 + =head1 NAME tr2crfpp.pl @@ -22,7 +26,13 @@ =head1 HISTORY require 5.0; use Getopt::Std; use strict 'vars'; +use utf8; +use 5.010; # MB1 use FindBin; + +binmode STDIN, ":encoding(UTF-8)"; # MB1 +binmode STDOUT, ":encoding(UTF-8)"; # MB1 + # use diagnostics; ### USER customizable section @@ -93,21 +103,29 @@ sub License { if ($filename = shift) { NEWFILE: if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; } - open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\""; - $fh = "IF"; + open (IF, "<:encoding(UTF-8)", $filename) || die "# $progname crash\t\tCan't open \"$filename\""; # set encoding to UTF-8 - MB1 +# open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\""; +# $fh = "IF"; } else { $filename = ""; $fh = "STDIN"; } -while (<$fh>) { +# while (<$fh>) { # set encoding to UTF-8 - MB1 +while () { # set encoding to UTF-8 - MB1 if (/^\#/) { next; } # skip comments elsif (/^\s+$/) { next; } # skip blank lines else { my $tag = ""; my @tokens = split(/ +/); my @feats = (); - my $hasPossibleEditor = (/(ed\.|editor|editors|eds\.)/) ? "possibleEditors" : "noEditors"; + + ### + # Regex updated accordingly to changes in Tr2crfpp.pm by Artemy Kolchinsky and new german editor strings in ConfigLang.pm + # MB1 + ### + my $hasPossibleEditor = (/[^A-Za-z](ed\.?|editor|editors|eds\.?|Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)/i) ? 
"possibleEditors" : "noEditors"; + my $j = 0; for (my $i = 0; $i <= $#tokens; $i++) { # for (my $i = $#tokens; $i >= 0; $i--) { diff --git a/lib/ParsCit/ConfigLang.pm b/lib/ParsCit/ConfigLang.pm new file mode 100644 index 0000000..18e8180 --- /dev/null +++ b/lib/ParsCit/ConfigLang.pm @@ -0,0 +1,63 @@ +package ParsCit::ConfigLang; + +################ +# Written By Matthias Bösinger (29.03.2016) +# +# Modul is used to set language specific data fields. +# Call of 'Init' with language type passed as parameter, will cause the initialization of the gloabl data fields. +# hasEditor: Regex used in feature determination to decide if a reference contains editor tokens. +# authorSplit: Regex used to split contiguous as author tags labeled tokens, into several author-names. +# authorDelete: Regex used to delete parts of an as author tag labeled token. +# inMarker: not in use in this version -> could be used for additional feature that marks a collective volume in the reference string +################ + +use utf8; + +## Global +$hasEditorRegex = ''; +$authorSplitRegex = ''; +$authorDeleteRegex = ''; +$inMarker = ''; + +## Language specific data +my %enData = ( 'editor' => '[^A-Za-z](ed\.?|editor|editors|eds\.?)', + 'author' => '^(&|/|and|a\.)$', + 'delete' => 'et\.? al\.?.*$', + 'in' => 'in' ); + +my %deData = ( 'editor' => '[^A-Za-z](Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)', + 'author' => '^(&|/|und|u\.)$', + 'delete' => '(u\.a\..*|et\.? 
al\.?.*)$', + 'in' => 'in' ); + + +## initialization methods +sub Init { + my ($lang) = @_; + + if ($lang eq "en") { + initData(%enData); + } + elsif ($lang eq "de") { + initData(%deData); + } + #additional languages might be included here - MB + else { + return 0; + } + + 1; +} + + +sub initData { + my (%data) = @_; + + $hasEditorRegex = $data{'editor'}; + $authorSplitRegex = $data{'author'}; + $authorDeleteRegex = $data{'delete'}; + $inMarker = $data{'in'}; + +} + +1; diff --git a/lib/ParsCit/PostProcess.pm b/lib/ParsCit/PostProcess.pm index ccd7e27..624d61c 100644 --- a/lib/ParsCit/PostProcess.pm +++ b/lib/ParsCit/PostProcess.pm @@ -5,6 +5,9 @@ package ParsCit::PostProcess; # representations. # # Isaac Councill, 07/20/07 +# +# Matthias Bösinger (29.03.2016) +# -> all changes marked with: MB1 ### use utf8; @@ -140,6 +143,17 @@ sub NormalizeFields # Content is a reference to a list of author $content = NormalizeAuthorNames($content); } + ### + # The NormalizeAuthorNames procedure is also used for editor strings. + # Single editors are nested within an 'editors' tag, each placed in a single 'editor' tag + # MB1 + ### + elsif ($tag eq "editor") + { + $tag = "editors"; + # Content is a reference to a list of author + $content = NormalizeAuthorNames($content); + } elsif ($tag eq "date") { $content = NormalizeDate($content); @@ -166,7 +180,21 @@ sub NormalizeFields # Heuristic - only get first instance of tag. # TODO: we can do better than that... 
- unless (defined $cite_info{ $tag }) { $cite_info{ $tag } = $content; } +# unless (defined $cite_info{ $tag }) { $cite_info{ $tag } = $content; } + + ### + # At least authors and editors are inserted into existing result lists + # MB1 + ### + if (defined $cite_info{ $tag }) { + if ($tag eq "authors" || $tag eq "editors") { + my $currentlist_ref = \@{$cite_info{ $tag }}; + push @{$currentlist_ref}, @{$content}; + } + } + else { + $cite_info{ $tag } = $content; + } } push @cite_infos, \%cite_info; @@ -269,7 +297,13 @@ sub NormalizeAuthorNames foreach my $tok (@tokens) { - if ($tok =~ m/^(&|and)$/i) + ### + # language specific author split strings are loaded from ParsCit::ConfigLang + # MB1 + ### + my $author_split_regex = $ParsCit::ConfigLang::authorSplitRegex; + + if ($tok =~ m/$author_split_regex/i) { if ($#current_auth >= 0) { @@ -323,7 +357,17 @@ sub RepairAndTokenizeAuthorText my ($author_text) = @_; # Repair obvious parse errors and weird notations. - $author_text =~ s/et\.? al\.?.*$//; + ### + # language specific author parts to delete are loaded from ParsCit::ConfigLang. + # 1) 'et al' ... and other words + # 2) 'in' ... as can be found at start of editor text + # MB1 + ### + my $to_delete_strings = $ParsCit::ConfigLang::authorDeleteRegex; + $author_text =~ s/$to_delete_strings//; + my $inpart = $ParsCit::ConfigLang::inMarker; + $author_text =~ s/^\s*$inpart:?\s*//i; + $author_text =~ s/^.*?[\p{IsUpper}\p{IsLower}][\p{IsUpper}\p{IsLower}]+\. 
//; $author_text =~ s/\(.*?\)//g; $author_text =~ s/^.*?\)\.?//g; @@ -336,7 +380,9 @@ sub RepairAndTokenizeAuthorText $author_text =~ s/;/,/g; $author_text =~ s/,/, /g; $author_text =~ s/\:/ /g; - $author_text =~ s/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g; +# $author_text =~ s/[\:\"\<\>\/\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g; + $author_text =~ s/[\:\"\<\>\?\{\}\[\]\+\=\(\)\*\^\%\$\#\@\!\~\_]//g; #keep 'slash' since slash is often used as name separator - MB1 + $author_text = JoinMultiWordNames($author_text); my @orig_tokens = split '\s+', $author_text; @@ -345,7 +391,8 @@ sub RepairAndTokenizeAuthorText for (my $i=0; $i <= $#orig_tokens; $i++) { my $tok = $orig_tokens[$i]; - if ($tok !~ m/[\p{IsUpper}\p{IsLower}&]/) + + if ($tok !~ m/[\p{IsUpper}\p{IsLower}&\/]/) #keep 'slash' since slash is often used as name separator - MB1 { if ($i < $#orig_tokens/2) { @@ -445,7 +492,8 @@ sub NormalizeAuthorName2 sub JoinMultiWordNames { my $author_text = shift; - $author_text =~ s/\b((?:van|von|der|den|de|di|le|el))\s/$1_/sgi; # Thang 02 Mar 10: change \1 into \$1 +# $author_text =~ s/\b((?:van|von|der|den|de|di|le|el))\s/$1_/sgi; # Thang 02 Mar 10: change \1 into \$1 + $author_text =~ s/\b((?:van|von|der|den|de|di|le|el|zu|zur|vom|zum|und|dem|d'|del|da|degli|dalla|te|ter|of|v.|d.|z.))\s/$1_/sgi; # added more multi word components - MB1 return $author_text; } diff --git a/lib/ParsCit/Tr2crfpp.pm b/lib/ParsCit/Tr2crfpp.pm index ef8d40f..85934cd 100644 --- a/lib/ParsCit/Tr2crfpp.pm +++ b/lib/ParsCit/Tr2crfpp.pm @@ -7,6 +7,9 @@ package ParsCit::Tr2crfpp; # # Copyright 2005 \251 by Min-Yen Kan (not sure what this means for IGC edits, but # what the hell -IGC) +# +# Matthias Bösinger (29.03.2016) +# -> all changes marked with: MB1 ### use utf8; @@ -17,6 +20,7 @@ use Encode (); use Omni::Config; use ParsCit::Config; +use ParsCit::ConfigLang; #MB1 ### USER customizable section my $tmp_dir = $ParsCit::Config::tmpDir; @@ -679,7 +683,7 @@ sub PrepDataUnmarkedToken # Prepare 
data for trfpp sub PrepData { - my ($rcite_text, $filename) = @_; + my ($rcite_text, $filename, $split) = @_; # Additional parameter 'split' - MB1 # Generate a temporary file my $tmpfile = BuildTmpFile($filename); @@ -700,6 +704,18 @@ sub PrepData # Skip blank lines if (/^\s*$/) { next; } + ### + # Insert extra whitespace: + # 1) around dots, which are preceded by a letter and succeeded by a capital letter, + # 2) around slashes, which are preceded by a letter, a semicolon or a dot and succeeded by a letter, a semicolon or a dot. + # This may occur in case of author strings. These author strings will not be processed correctly if no whitespace insertion is done. + # MB1 + ### + if ($split == 1) { + s/(\p{L})\.(\p{isUpper})/$1. $2/g; # Rule 1) + s/([\p{L}\.;]) ?\/ ?([\p{L}\.;])/$1 \/ $2/g; # Rule 2) + } + my $tag = ""; my @tokens = split(/ +/); my @feats = (); @@ -710,7 +726,15 @@ sub PrepData # I changed this string to match 'ed.', 'editor', 'editors', and 'eds.' if *not* # preceeded by an alphabetic character. ### - my $has_possible_editor = (/[^A-Za-z](ed\.|editor|editors|eds\.)/) ? "possibleEditors" : "noEditors"; +# my $has_possible_editor = (/[^A-Za-z](ed\.|editor|editors|eds\.)/) ? "possibleEditors" : "noEditors"; + + ### + # language specific editor strings are loaded from ParsCit::ConfigLang + # And: regex set to case insensitive + # MB1 + ### + my $has_editor_regex = $ParsCit::ConfigLang::hasEditorRegex; + my $has_possible_editor = (/$has_editor_regex/i) ? "possibleEditors" : "noEditors"; my $j = 0; for (my $i = 0; $i <= $#tokens; $i++) @@ -958,7 +982,7 @@ sub PrepData ($word =~ /^[0-9]{2-5}\([0-9]{2-5}\).?$/) ? "possibleVol" : "others"; # 22 = punctuation push(@{ $feats[ $j ] }, $punct); - + # output tag push(@{ $feats[ $j ] }, $tag);