Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 64 additions & 18 deletions bin/parseRefStrings.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,53 @@
# Min-Yen Kan (Thu Feb 28 14:10:28 SGT 2008)
# Derived from citeExtract.pl
#
# Matthias Bösinger (29.03.2016)
# -> all changes marked with: MB1

use strict;
use FindBin;
use Getopt::Long;
use lib "$FindBin::Bin/../lib";

use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";

use ParsCit::Controller;
use CSXUtil::SafeText qw(cleanAll cleanXML);
use ParsCit::ConfigLang;

###
# set standard encoding to UTF-8
# MB1
###
binmode STDIN, ":encoding(UTF-8)";
binmode STDOUT, ":encoding(UTF-8)";

### Get additional parameter (language parameter (english as default), split parameter, keep temp files parameter) - MB1
my $lang = "en";
my $split = '';
my $keep = '';
if (!GetOptions("lang=s" => \$lang, "split" => \$split, "keep" => \$keep)) {
print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n";
exit;
}
# initialize language config
if (!ParsCit::ConfigLang::Init($lang)) {
print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n";
exit;
}
### End (additional parameter) - MB1

my $textFile = $ARGV[0];
my $outFile = $ARGV[1];

if (!defined $textFile) {
print "Usage: $0 textfile [outfile]\n";
print "Usage: $0 textfile outfile [-lang=en|de] [-split] [-keep]\n"; # Updated - MB1
exit;
}

open (IF, $textFile) || die "Couldn't open text file \"textFile\"!";
# open (IF, $textFile) || die "Couldn't open text file \"textFile\"!";
open (IF, "<:utf8", $textFile) || die "Couldn't open text file \"textFile\"!"; # set to utf-8-encoding - MB1
my $normalizedCiteText = "";
my $line = 0;
while (<IF>) {
Expand All @@ -42,16 +70,16 @@
}

our $msg = "";
my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile);
my $outFile = $tmpFile."_dec";
my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile, $split); # Additional parameter 'split' - MB1
my $outTmpFile = $tmpFile."_dec"; # Changed name from '$outFile'. Otherwise name conflict with 'outFile' from $ARGV[1]; - MB1
my @validCitations = ();

my $xml = "";
$xml .= "<algorithm name=\"$ParsCit::Config::algorithmName\" version=\"$ParsCit::Config::algorithmVersion\">\n";
$xml .= "<citationList>\n";
if (ParsCit::Tr2crfpp::Decode($tmpFile, $outFile)) {
if (ParsCit::Tr2crfpp::Decode($tmpFile, $outTmpFile)) {
my ($rRawXML, $rCiteInfo, $tstatus, $tmsg) =
ParsCit::PostProcess::ReadAndNormalize($outFile);
ParsCit::PostProcess::ReadAndNormalize($outTmpFile);
if ($tstatus <= 0) {
return ($tstatus, $msg, undef, undef);
}
Expand All @@ -60,15 +88,16 @@
my %citeInfo = %{$citeInfo[$i]};
$xml .= "<citation>\n";
foreach my $key (keys %citeInfo) {
if ($key eq "authors" || $key eq "editors") {
my $singular = $key;
chop $singular;
$xml .= "<$key>\n";
foreach my $person (@{$citeInfo{$key}}) {
cleanAll(\$person);
$xml .= "<$singular>$person</$singular>\n";
}
$xml .= "</$key>\n";
if ($key eq "authors" || $key eq "editors")
{
my $singular = $key;
chop $singular;
$xml .= "<$key>\n";
foreach my $person (@{$citeInfo{$key}}) {
cleanAll(\$person);
$xml .= "<$singular>$person</$singular>\n";
}
$xml .= "</$key>\n";
}
elsif ($key eq "volume")
{
Expand Down Expand Up @@ -96,7 +125,24 @@
$xml .= "</citationList>\n</algorithm>\n";
}

unlink($tmpFile);
unlink($outFile);
###
# tmp-files are kept if parameter has been set.
# MB1
###
unless ($keep) {
unlink($tmpFile);
unlink($outTmpFile);
}

###
# If outFile has been passed as a parameter, the resulting XML is printed to that file.
# Otherwise the resulting XML is printed to standard output.
# MB1
###
if (open(OUT, ">:utf8", $outFile)) {
print OUT $xml;
}
else {
print $xml;
}

print $xml;
58 changes: 58 additions & 0 deletions bin/prepData.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/perl -CSD
#
# Script to transform reference strings to crf++ compatible data.
# By doing so, CRF++ can be used manually with the transformed reference data.
#
# Derived from 'parseRefStrings.pl'
#
# written by Matthias Bösinger (29.03.2016)

use strict;
use FindBin;
use lib "$FindBin::Bin/../lib";

use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/5.10.0";
use lib "/home/wing.nus/tools/languages/programming/perl-5.10.0/lib/site_perl/5.10.0";

use ParsCit::Controller;
use CSXUtil::SafeText qw(cleanAll cleanXML);

# Command-line arguments:
#   $ARGV[0]  text file with one reference string per line
#   $ARGV[1]  output file that receives the CRF++-compatible data
my $textFile = $ARGV[0];
my $outFile  = $ARGV[1];

if (!defined $textFile || !defined $outFile) {
    print "Usage: $0 textfile outfile\n";
    exit;
}

# Three-argument open with a lexical handle, so the file name can never be
# interpreted as an open mode. No explicit encoding layer is given here; the
# script's shebang line requests -CSD for the default I/O layers.
open(my $textFh, '<', $textFile)
    or die "Couldn't open text file \"$textFile\": $!";

my $normalizedCiteText = "";
my $line = 0;
while (my $refString = <$textFh>) {
    # chomp (not chop): a final line without a trailing newline must keep
    # its last character.
    chomp $refString;
    # Tr2cfpp needs an enclosing tag for initial class seed.
    $normalizedCiteText .= "<title> " . $refString . " </title>\n";
    $line++;
}
close($textFh);

if ($line == 0) {
    # Stop - nothing left to do.
    exit();
}

# NOTE(review): $msg is not read in this script; presumably kept for the
# ParsCit library code - confirm before removing.
our $msg = "";

# PrepData writes the CRF++ feature data to a temporary file and returns
# that file's name.
my $tmpFile = ParsCit::Tr2crfpp::PrepData(\$normalizedCiteText, $textFile);

open(my $tmpFh, '<', $tmpFile)
    or die "Couldn't open tmp file \"$tmpFile\": $!";

my $outFh;
if (!open($outFh, '>', $outFile)) {
    # Do not leave the temporary file behind when the output cannot be opened.
    my $openError = $!;
    close($tmpFh);
    unlink($tmpFile);
    die "Couldn't open out file \"$outFile\": $openError";
}

# Copy the temporary file to the output file, normalising every line ending
# to a single "\n".
while (my $row = <$tmpFh>) {
    chomp $row;
    print {$outFh} $row . "\n";
}

close($tmpFh);
# Buffered write errors only surface when the handle is closed.
close($outFh) or die "Couldn't write out file \"$outFile\": $!";

unlink($tmpFile);

26 changes: 22 additions & 4 deletions bin/tr2crfpp.pl
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#!/usr/bin/env perl
# -*- cperl -*-
#
# Matthias Bösinger (29.03.2016)
# -> all changes marked with: MB1

=head1 NAME

tr2crfpp.pl
Expand All @@ -22,7 +26,13 @@ =head1 HISTORY
require 5.0;
use Getopt::Std;
use strict 'vars';
use utf8;
use 5.010; # MB1
use FindBin;

binmode STDIN, ":encoding(UTF-8)"; # MB1
binmode STDOUT, ":encoding(UTF-8)"; # MB1

# use diagnostics;

### USER customizable section
Expand Down Expand Up @@ -93,21 +103,29 @@ sub License {
if ($filename = shift) {
NEWFILE:
if (!(-e $filename)) { die "# $progname crash\t\tFile \"$filename\" doesn't exist"; }
open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
$fh = "IF";
open (IF, "<:encoding(UTF-8)", $filename) || die "# $progname crash\t\tCan't open \"$filename\""; # set encoding to UTF-8 - MB1
# open (*IF, $filename) || die "# $progname crash\t\tCan't open \"$filename\"";
# $fh = "IF";
} else {
$filename = "<STDIN>";
$fh = "STDIN";
}

while (<$fh>) {
# while (<$fh>) { # set encoding to UTF-8 - MB1
while (<IF>) { # set encoding to UTF-8 - MB1
if (/^\#/) { next; } # skip comments
elsif (/^\s+$/) { next; } # skip blank lines
else {
my $tag = "";
my @tokens = split(/ +/);
my @feats = ();
my $hasPossibleEditor = (/(ed\.|editor|editors|eds\.)/) ? "possibleEditors" : "noEditors";

###
# Regex updated to follow the changes in Tr2crfpp.pm by Artemy Kolchinsky and the new German editor strings in ConfigLang.pm
# MB1
###
my $hasPossibleEditor = (/[^A-Za-z](ed\.?|editor|editors|eds\.?|Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)/i) ? "possibleEditors" : "noEditors";

my $j = 0;
for (my $i = 0; $i <= $#tokens; $i++) {
# for (my $i = $#tokens; $i >= 0; $i--) {
Expand Down
63 changes: 63 additions & 0 deletions lib/ParsCit/ConfigLang.pm
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package ParsCit::ConfigLang;

################
# Written By Matthias Bösinger (29.03.2016)
#
# This module is used to set language-specific data fields.
# Calling 'Init' with the language type as parameter initializes the global data fields.
# hasEditor: Regex used in feature determination to decide if a reference contains editor tokens.
# authorSplit: Regex used to split contiguous tokens labeled as author into several author names.
# authorDelete: Regex used to delete parts of a token labeled as author.
# inMarker: not in use in this version -> could be used for an additional feature that marks a collective volume in the reference string
################

use strict;
use warnings;
use utf8;

## Global
# Language-dependent settings, filled in by Init()/initData().
# Declared with 'our' so they remain package variables (reachable from other
# code as $ParsCit::ConfigLang::<name>) while compiling cleanly under strict.
our $hasEditorRegex    = '';   # regex source: does a reference contain editor tokens?
our $authorSplitRegex  = '';   # regex source: separator tokens between author names
our $authorDeleteRegex = '';   # regex source: parts to strip from an author token
our $inMarker          = '';   # marker string; documented above as not in use in this version

## Language specific data
# Each hash carries the same four keys, which initData() copies into the
# globals above: 'editor', 'author', 'delete', 'in'.
my %enData = (
    'editor' => '[^A-Za-z](ed\.?|editor|editors|eds\.?)',
    'author' => '^(&|/|and|a\.)$',
    'delete' => 'et\.? al\.?.*$',
    'in'     => 'in',
);

my %deData = (
    'editor' => '[^A-Za-z](Hrsg\.?|Herausgeber|Hg\.?|hgg\.?)',
    'author' => '^(&|/|und|u\.)$',
    'delete' => '(u\.a\..*|et\.? al\.?.*)$',
    'in'     => 'in',
);


## initialization methods
# Select the language-specific data set and load it into the package globals.
#
# Parameter: language code; "en" and "de" are supported.
# Returns:   1 when the language is supported and its data has been loaded,
#            0 for any other language code (nothing is changed in that case).
sub Init {
    my ($requested) = @_;

    my $isEnglish = ($requested eq "en");
    my $isGerman  = ($requested eq "de");

    # additional languages might be included here - MB
    return 0 unless $isEnglish || $isGerman;

    initData($isEnglish ? %enData : %deData);

    return 1;
}


# Copy one language's settings into the package-level configuration variables.
#
# Parameters: a flat key/value list with the keys 'editor', 'author',
#             'delete' and 'in'.
sub initData {
    my %langData = @_;

    # Hash slice: the three regex sources are assigned in one step.
    ($hasEditorRegex, $authorSplitRegex, $authorDeleteRegex)
        = @langData{'editor', 'author', 'delete'};

    $inMarker = $langData{'in'};
}

1;
Loading