knmnyn · maboberlin · Apr 1, 2016 · Jan 7, 2017
diff --git a/bin/parseRefStrings.pl b/bin/parseRefStrings.pl
@@ -7,25 +7,53 @@
 # Min-Yen Kan (Thu Feb 28 14:10:28 SGT 2008)
 #  Derived from citeExtract.pl
 #
+# Matthias Bösinger (29.03.2016)
+# -> all changes marked with: MB1
+
 use strict;
 use FindBin;
+use Getopt::Long;
 use lib "$FindBin::Bin/../lib";
 
 use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
 use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
 
 use ParsCit::Controller;
 use CSXUtil::SafeText qw(cleanAll cleanXML);
+use ParsCit::ConfigLang;
+
+###
+# set standard encoding to UTF-8 
+# MB1
+###
+binmode STDIN, ":encoding(UTF-8)";
+binmode STDOUT, ":encoding(UTF-8)";
+
+### Get additional parameter (language parameter (english as default), split parameter, keep temp files parameter) - MB1
+my $lang = "en";
+my $split = '';
+my $keep = '';
+my $optUsage = "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n";
+# Report option errors on STDERR with a non-zero status: STDOUT may carry the
+# XML result, and an exit status of 0 would look like success to the caller.
+GetOptions("lang=s" => \$lang, "split" => \$split, "keep" => \$keep)
+    or die $optUsage;
+# initialize language config; Init returns false for an unsupported language
+ParsCit::ConfigLang::Init($lang)
+    or die "Unsupported language \"$lang\"\n" . $optUsage;
+
+### End (additional parameter) - MB1
 
 my $textFile = $ARGV[0];
 my $outFile = $ARGV[1];
 
 if (!defined $textFile) {
-    print "Usage: $0 textfile [outfile]\n";
+    print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n";	# Updated - MB1
     exit;
 }
 
-open (IF, $textFile) || die "Couldn't open text file \"textFile\"!";
+# open (IF, $textFile) || die "Couldn't open text file \"textFile\"!";
+open (IF, "<:utf8", $textFile) || die "Couldn't open text file \"$textFile\": $!"; 	# set to utf-8-encoding - MB1; the message names the actual file and the OS error
 my $normalizedCiteText = "";
 my $line = 0;
 while (<IF>) {
@@ -42,16 +70,16 @@
 }
 
 our $msg = "";
-my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile);
-my $outFile = $tmpFile."_dec";
+my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile, $split); 	# Additional parameter 'split' - MB1
+my $outTmpFile = $tmpFile."_dec"; 	# Changed name from '$outFile'. Otherwise name conflict with 'outFile' from $ARGV[1]; - MB1
 my @validCitations = ();
 
 my $xml = "";
 $xml .= "<algorithm name=\"$ParsCit::Config::algorithmName\" version=\"$ParsCit::Config::algorithmVersion\">\n";
 $xml .= "<citationList>\n";
-if (ParsCit::Tr2crfpp::Decode($tmpFile, $outFile)) {
+if (ParsCit::Tr2crfpp::Decode($tmpFile, $outTmpFile)) {
     my ($rRawXML, $rCiteInfo, $tstatus, $tmsg) =
-	ParsCit::PostProcess::ReadAndNormalize($outFile);
+	ParsCit::PostProcess::ReadAndNormalize($outTmpFile);
     if ($tstatus <= 0) {
 	return ($tstatus, $msg, undef, undef);
     }
@@ -60,15 +88,16 @@
 	my %citeInfo = %{$citeInfo[$i]};
 	$xml .= "<citation>\n";
 	foreach my $key (keys %citeInfo) {
-	    if ($key eq "authors" || $key eq "editors") {
-		my $singular = $key;
-		chop $singular;
-		$xml .= "<$key>\n";
-		foreach my $person (@{$citeInfo{$key}}) {
-			cleanAll(\$person);
-		    $xml .= "<$singular>$person</$singular>\n";
-		}
-		$xml .= "</$key>\n";
+	    if ($key eq "authors" || $key eq "editors") 
+	    {
+			my $singular = $key;
+			chop $singular;
+			$xml .= "<$key>\n";
+			foreach my $person (@{$citeInfo{$key}}) {
+				cleanAll(\$person);
+				$xml .= "<$singular>$person</$singular>\n";
+			}
+			$xml .= "</$key>\n";
 	    } 
 		elsif ($key eq "volume") 
 		{
@@ -96,7 +125,24 @@
     $xml .= "</citationList>\n</algorithm>\n";
 }
 
-unlink($tmpFile);
-unlink($outFile);
+###
+# Temp files are kept if the -keep parameter has been set.
+# MB1
+###
+unless ($keep) { 
+	unlink($tmpFile); 
+	unlink($outTmpFile);
+}
+
+###
+# Print the result .xml to outFile if one was passed, else (or if it cannot be opened) to standard out.
+# open() on an undefined path would succeed with an anonymous temp file, hence the defined-check. - MB1
+###
+if (defined $outFile && open(OUT, ">:utf8", $outFile)) {
+	print OUT $xml;
+	close(OUT);
+}
+else {
+	print $xml;
+}
 
-print $xml;
diff --git a/bin/prepData.pl b/bin/prepData.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl -CSD
+#
+# Script to transform reference strings to CRF++ compatible data.
+# By doing so, CRF++ can be used manually with the transformed reference data.
+# Usage: prepData.pl textfile outfile   (one reference string per input line)
+# Derived from 'parseRefStrings.pl'
+#
+# written by Matthias Bösinger (29.03.2016)
+
+use strict;
+use FindBin;
+use lib "$FindBin::Bin/../lib";
+
+use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
+use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";
+
+use ParsCit::Controller;
+use CSXUtil::SafeText qw(cleanAll cleanXML);
+
+my $textFile = $ARGV[0];
+my $outFile = $ARGV[1];
+
+if (!defined $textFile || !defined $outFile) {
+    print "Usage: $0 textfile outfile\n";
+    exit;
+}
+
+open (IF, "<", $textFile) || die "Couldn't open text file \"$textFile\": $!";
+my $normalizedCiteText = "";
+my $line = 0;
+while (<IF>) {
+  chomp;	# unlike chop, keeps the last character of a final line that has no newline
+  # Tr2crfpp needs an enclosing tag for initial class seed.
+  $normalizedCiteText .= "<title> " . $_ . " </title>\n";
+  $line++;
+}
+close (IF);
+
+if ($line == 0) {
+  # Stop - nothing left to do.
+  exit();
+}
+
+our $msg = "";
+my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile);
+
+open (TF, "<", $tmpFile) || die "Couldn't open tmp file \"$tmpFile\": $!";
+open (OF, ">", $outFile) || die "Couldn't open out file \"$outFile\": $!";
+while (<TF>) {
+	chomp;	# only strip a real line terminator before re-adding "\n"
+	print OF $_ . "\n";
+}
+
+close(TF);
+close(OF) || die "Couldn't write out file \"$outFile\": $!";	# buffered write errors surface at close
+
+unlink($tmpFile);
+
diff --git a/bin/tr2crfpp.pl b/bin/tr2crfpp.pl
@@ -1,5 +1,9 @@
 #!/usr/bin/env perl
 # -*- cperl -*-
+#
+# Matthias Bösinger (29.03.2016)
+# -> all changes marked with: MB1
+
 =head1 NAME
 
 tr2crfpp.pl
@@ -22,7 +26,13 @@ =head1 HISTORY
 require 5.0;
 use Getopt::Std;
 use strict 'vars';
+use utf8;
+use 5.010; 	# MB1
 use FindBin;
+
+binmode STDIN, ":encoding(UTF-8)"; 	# MB1
+binmode STDOUT, ":encoding(UTF-8)"; 	# MB1
+
 # use diagnostics;
 
 ### USER customizable section
@@ -93,21 +103,29 @@ sub License {
 if ($filename = shift) {
  NEWFILE:
   if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
-  open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
-  $fh = "IF";
+  open (IF, "<:encoding(UTF-8)", $filename) || die "# $progname crash\t\tCan't open \"$filename\""; 	# set encoding to UTF-8 - MB1
+# previously: open (*IF, $filename) || die ...; $fh = "IF";
 } else {
   $filename = "<STDIN>";
   $fh = "STDIN";
+  *IF = *STDIN; 	# the read loop below reads <IF>; alias it to STDIN so piped input is still processed - MB1
 }
 
-while (<$fh>) {
+# while (<$fh>) { 	# set encoding to UTF-8 - MB1
+while (<IF>) { 	# set encoding to UTF-8 - MB1
   if (/^\#/) { next; }			# skip comments
   elsif (/^\s+$/) { next; }		# skip blank lines
   else {
     my $tag = "";
     my @tokens = split(/ +/);
     my @feats = ();
-    my $hasPossibleEditor = (/(ed\.|editor|editors|eds\.)/) ? "possibleEditors" : "noEditors";
+
+    ###
+    # Regex updated according to the changes in Tr2crfpp.pm by Artemy Kolchinsky and the new German editor strings in ConfigLang.pm
+    # MB1
+    ###
+    my $hasPossibleEditor = (/[^A-Za-z](ed\.?|editor|editors|eds\.?|Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)/i) ? "possibleEditors" : "noEditors";
+
     my $j = 0;
     for (my $i = 0; $i <= $#tokens; $i++) {
 #    for (my $i = $#tokens; $i >= 0; $i--) {

diff --git a/lib/ParsCit/ConfigLang.pm b/lib/ParsCit/ConfigLang.pm
@@ -0,0 +1,63 @@
+package ParsCit::ConfigLang;
+
+################
+# Written By Matthias Bösinger (29.03.2016)
+#
+# Module used to set language-specific data fields.
+# Calling 'Init' with a language code ("en" or "de") initializes the global data fields below.
+# hasEditorRegex: Regex used in feature determination to decide if a reference contains editor tokens.
+# authorSplitRegex: Regex used to split contiguous tokens labeled as author into several author names.
+# authorDeleteRegex: Regex used to delete parts of a token labeled as author.
+# inMarker: not in use in this version -> could be used for an additional feature that marks a collective volume in the reference string
+################
+
+use strict;
+use warnings;
+use utf8;
+
+## Globals (package variables, overwritten by Init)
+our ($hasEditorRegex, $authorSplitRegex, $authorDeleteRegex, $inMarker) = ('', '', '', '');
+
+## Language specific data, keyed by the language code accepted by Init
+my %langData = (
+	'en' => {	'editor' => '[^A-Za-z](ed\.?|editor|editors|eds\.?)',
+				'author' => '^(&|/|and|a\.)$',
+				'delete' => 'et\.? al\.?.*$',
+				'in' 	 => 'in' },
+	'de' => {	'editor' => '[^A-Za-z](Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)',
+				'author' => '^(&|/|und|u\.)$',
+				'delete' => '(u\.a\..*|et\.? al\.?.*)$',
+				'in' 	 => 'in' },
+	# additional languages might be included here - MB
+);
+
+
+## initialization methods
+
+# Init($lang): select the data set for language code $lang.
+# Returns 1 on success; returns 0 and leaves the globals untouched
+# when $lang is undefined or not a supported language.
+sub Init {
+	my ($lang) = @_;
+
+	return 0 unless defined $lang && exists $langData{$lang};
+
+	initData(%{ $langData{$lang} });
+
+	return 1;
+}
+
+
+# initData(%data): copy the 'editor', 'author', 'delete' and 'in' entries
+# of %data into the corresponding package globals.
+sub initData {
+	my (%data) = @_;
+
+	$hasEditorRegex = $data{'editor'};
+	$authorSplitRegex = $data{'author'};
+	$authorDeleteRegex = $data{'delete'};
+	$inMarker = $data{'in'};
+
+}
+
+1;