#!/usr/bin/perl

####################################################################
#
# File: bibweb
# Author: John H. Palmieri <palmieri@math.washington.edu>
#         URL: http://www.math.washington.edu/~palmieri/Bibweb/
# Version: 0.49 of Wed Nov 19 12:58:29 PST 2003
# Description: retrieve bibliographical information from MathSciNet
#              automatically
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003 John H. Palmieri
# License: distributed under GNU General Public License -- see below.
#
####################################################################
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# (see file COPYING) along with this program; if not, write to the
# Free Software Foundation, Inc., 59 Temple Place - Suite 330,
# Boston, MA  02111-1307, USA.
#
####################################################################
#
# Command line options:
#    FILE        use FILE.aux as input, FILE.bib as output
#    -i FILE     specify FILE as input (aux) file.
#    -o FILE     specify FILE as output (bib) file.  If FILE ends in
#                 ".bib", write to FILE; otherwise, write to FILE.bib
#    -c CITATION looks up CITATION, rather than using an auxfile for input
#    -b          get output in bibtex format (default)
#    -r          insert the review in the bibtex output
#    -d          get output in dvi format (written to MR#.dvi, where
#                 'MR#' is the Math Reviews number of the reference)
#    -p          get output in postscript format (written to MR#.ps)
#    -pdf        get output in pdf format (written to MR#.pdf)
#    -t          get output in text format (written to MR#.txt)
#    -m NUM      return at most NUM entries (where NUM is rounded up
#                 to 5, 10, 20, 50, 100, 1000)
#    -e WEB_SITE  use WEB_SITE for MathSciNet search
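#    -L          use lynx instead of wget (note: lynx is slower)
#    -std        write output to STDOUT (the screen, ordinarily)
#    -sep CHAR   use CHAR to delimit fields in long citation format,
#                 instead of semicolon (;)
#    -h          print brief help message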
#
# If you use only one of the -i and -o options, bibweb makes a guess
# as to what the other file should be.
#
# At the moment, -d, -p, -pdf, -t only work if you have lynx available.
####################################################################

# Identity of this program and the name of the external bibtex command.
$thisprog = 'bibweb';
$version  = '0.49';
$bibtex   = 'bibtex';

# MathSciNet site to query.  The MATHSCINET_SITE environment variable
# wins when it is set to a true value; otherwise fall back to the AMS.
# good choices for e_math: 'www.ams.org', 'ams.rice.edu',
#   'ams.mathematik.uni-bielefeld.de', 'ams.mpim-bonn.mpg.de',
#   'ams.u-strasbg.fr', 'ams.impa.br'
$e_math = $ENV{MATHSCINET_SITE} || 'www.ams.org';

# Markers written in front of the bookkeeping lines bibweb adds to the
# bib file.  % is not the official BibTeX comment character, so change
# these if you need to be absolutely compatible (the official mechanism
# may be @comment).
$bibtex_short_comment = '%%';
$bibtex_long_comment  = '%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%';
$bsc = $bibtex_short_comment;

# Default: write to the bib file rather than to STDOUT.
$use_stdout = 0;

# Help text shown for -h and for unparsable command lines.
$usage = <<EOT;
This is $thisprog, version $version.
To run bibtex on FILE, and then scan MathSciNet for missing
  bibliographical information: $0 [options] FILE
  options can be:
    -i FILE     use \'FILE\' as input (aux) file
    -o FILE     use \'FILE\' as output (bib) file
    -c REF      looks up single citation \'REF\', instead of using
                 an auxfile for input
    -b          get output in bibtex format (default)
    -r          insert the review in the bibtex output
    -d          get output in dvi format (written to MR\#.dvi, where
                 \'MR\#\' is the Math Reviews number of the reference)
    -p          get output in postscript format (written to MR\#.ps)
    -pdf        get output in pdf format (written to MR\#.pdf)
    -t          get output in text format (written to MR\#.txt)
    -m NUM      return at most NUM entries (rounded up)
    -e WEB_SITE  use WEB_SITE for MathSciNet search
    -L          use lynx instead of wget (note: lynx is slower)
    -std        write output to STDOUT (the screen, ordinarily)
    -sep CHAR   use CHAR to delimit fields in long citation format,
                instead of semicolon (;)
    -h          print this help message
EOT

# read command line arguments
use Getopt::Long;
# Parse the command line.  Each option stores into a package global that
# the rest of the script reads; $e_math may already hold a default taken
# from the environment, which -e overrides.  An unparsable command line
# is an error: show the usage text on STDERR and exit with failure.
GetOptions(
    "input|i=s"        => \$auxfile,
    "output|o=s"       => \$bibfile,
    "stdout|std|s"     => \$use_stdout,
    "cite|c=s"         => \$only_cite,
    "bibtex|bib|b"     => \$use_bibtex,
    "review|rev|r"     => \$review,
    "dvi|d"            => \$use_dvi,
    "postscript|ps|p"  => \$use_postscript,
    "pdf"              => \$use_pdf,
    "text|txt|t"       => \$use_text,
    "max|m=i"          => \$user_max,
    "emath|e=s"        => \$e_math,
    "lynx|Lynx|L"      => \$use_lynx,
    "separator|sep=s"  => \$new_separator,
    "help|h"           => \$help,
) or die "$usage\n";

# Asking for help is not a failure: print the usage text on STDOUT (so it
# can be piped to a pager) and exit successfully instead of using die.
if ($help) {
    print "$usage\n";
    exit 0;
}

# Choose the output format.  The first flag set wins, in the order
# bibtex, dvi, postscript, pdf, text; with no flag at all the default is
# bibtex (and $use_bibtex is switched on to record that).  Every
# non-bibtex format forces the use of lynx.  $getitem_return is the value
# later sent as the "return" parameter of the mathscinet-getitem request
# and, for non-bibtex formats, the extension of the output file.
if ($use_bibtex) {
    ($output_type, $getitem_return) = ('bibtex', 'bibtex');
}
elsif ($use_dvi) {
    ($output_type, $getitem_return) = ('dvi', 'dvi');
    $use_lynx = 1;
}
elsif ($use_postscript) {
    ($output_type, $getitem_return) = ('postscript', 'ps');
    $use_lynx = 1;
}
elsif ($use_pdf) {
    ($output_type, $getitem_return) = ('pdf', 'pdf');
    $use_lynx = 1;
}
elsif ($use_text) {
    ($output_type, $getitem_return) = ('text', 'hl');
    $use_lynx = 1;
}
else {
    $use_bibtex = 1;
    ($output_type, $getitem_return) = ('bibtex', 'bibtex');
}

# Locate the candidate download programs on the current PATH.
chomp($which_lynx   = `which lynx`);
chomp($which_webget = `which webget`);
chomp($which_wget   = `which wget`);

# Pick the program ($wget) and the switches that make it write the page
# source to standard output.  wget is preferred, then webget; lynx is
# used when it was requested (or forced by the output format) or when
# neither of the others is executable.  With nothing usable, give up.
($wget, $wget_switches) =
      (!$use_lynx && (-x $which_wget))   ? ('wget',   ' --quiet -O - ')
    : (!$use_lynx && (-x $which_webget)) ? ('webget', '')
    : (-x $which_lynx)                   ? ('lynx',   ' -source ')
    : die "I can't find wget, lynx, or webget.  Your path may not be set
correctly, or you may need to install one of these programs.  Lynx
must be available in order to use the -dvi, -ps, -pdf, and -txt options.\n";

# Map a user-supplied MathSciNet site to its host name and to the
# "redirect" value sent with every search request.
#
# Argument: a short alias ('ams', 'rice', 'bielefeld', 'bonn', 'strasbg',
#           'impa') or a full host name.
# Returns:  the two-element list (HOST, REDIRECT).  HOST is the expanded
#           alias, or the argument unchanged when it is not an alias.
#           REDIRECT is undef when HOST is not one of the recommended
#           sites.
#
# Sites are recognised by exact string comparison.  The previous regular
# expression left the dots unescaped, so a host such as "wwwXamsYorg" was
# reported as a recommended site even though no redirect existed for it.
sub resolve_mathscinet_site {
    my ($site) = @_;
    $site = '' unless defined $site;

    my %host_for_alias = (
        'ams'       => 'www.ams.org',
        'rice'      => 'ams.rice.edu',
        'bielefeld' => 'ams.mathematik.uni-bielefeld.de',
        'bonn'      => 'ams.mpim-bonn.mpg.de',
        'strasbg'   => 'ams.u-strasbg.fr',
        'impa'      => 'ams.impa.br',
    );
    my %redirect_for_host = (
        'www.ams.org'                     => 'Providence%2C+RI+USA',
        'ams.rice.edu'                    => 'Houston%2C+TX+USA',
        'ams.mathematik.uni-bielefeld.de' => 'Bielefeld%2C+Germany',
        'ams.mpim-bonn.mpg.de'            => 'Bonn%2C+Germany',
        'ams.u-strasbg.fr'                => 'Strasbourg%2C+France',
        'ams.impa.br'                     => 'Rio+de+Janeiro%2C+Brazil',
    );

    $site = $host_for_alias{$site} if exists $host_for_alias{$site};
    return ($site, $redirect_for_host{$site});
}

# Expand the site chosen on the command line (or taken from the
# environment), tell the user which site is used, and remember the
# matching redirect value.  For an unrecognised site $redirect is left
# unset and the search proceeds anyway.
{
    my ($host, $known_redirect) = resolve_mathscinet_site($e_math);
    $e_math = $host;
    if (defined $known_redirect) {
        $redirect = $known_redirect;
        print "Using \`$e_math\' for the MathSciNet search.\n"
    }
    else {
        print "Warning: Your choice of \`$e_math\' for the MathSciNet site
is not one of the recommended choices.  Proceeding anyway...\n\n";
    }
}

# Convert the -m argument into one of the accepted limits
# 5, 10, 20, 50, 100, 1000.
#
# Argument: the requested maximum (may be undef or 0 when -m was absent).
# Returns:  5 when nothing was requested; otherwise the smallest accepted
#           limit that is >= the request, or 1000 when the request is
#           larger than every accepted limit.
sub round_max_matches {
    my ($requested) = @_;
    return 5 unless $requested;
    foreach my $limit (5, 10, 20, 50, 100, 1000) {
        return $limit if $requested <= $limit;
    }
    return 1000;
}

# Requests above 1000 used to fall back to 5 while announcing
# "Rounding -m argument up to 5"; they are now clamped to 1000 and
# reported as a reduction.
$max_matches = round_max_matches($user_max);
if ($user_max and $user_max != $max_matches) {
    if ($user_max < $max_matches) {
	print "Rounding -m argument up to $max_matches.\n\n";
    }
    else {
	print "Reducing -m argument to $max_matches, the largest allowed value.\n\n";
    }
}

# Work out the input (aux) and output (bib) file names.  When -i and -o
# are not both given, the first remaining command-line argument FILE
# supplies whichever name is missing.
unless ($auxfile and $bibfile) {
    $file = shift(@ARGV);
    $file = '' unless defined $file;
    $auxfile = ($auxfile or $file);
    # Derive a bib name from FILE only when FILE was really given.
    # Previously a missing FILE produced the literal name ".bib", which
    # then hid the "guess from the aux file" fallback below.
    $bibfile = ($bibfile or ($file ne '' ? "$file.bib" : ''));
}

# The output file always carries the .bib extension.
if ($bibfile and $bibfile !~ m/\.bib$/) {
    $bibfile = "$bibfile.bib";
}

if ($only_cite) {
    # Single-citation mode: no aux file is read.  Strip every quote
    # character from the citation (the old substitution removed only the
    # two-character sequence '" once), use a one-element dummy list so
    # the main loop runs exactly once, and write to STDOUT when no
    # output file was named.
    $only_cite =~ tr/'"//d;
    @bibtex_output = (1);
    $use_stdout = 1 if ($bibfile eq "" or $bibfile eq ".bib");
}
else {
    $auxfile = '' unless defined $auxfile;
    $auxfile =~ s/\.aux$//;
    # If only one of the two names is known, guess the other from it.
    if ($auxfile and not $bibfile) {
	$bibfile = "$auxfile.bib";
    }
    elsif ($bibfile and not $auxfile) {
	($auxfile = $bibfile) =~ s/\.bib$//;
    }
    unless (-e "$auxfile.aux") {die "Couldn't read $auxfile.aux\n"};
    unless ($use_stdout) {
	# Three-argument open, so that unusual characters in the file
	# name are never interpreted as part of the open mode.
	open(BIBFILE, '>>', $bibfile) or die "Couldn't open $bibfile\n";
	print "Appending output to $bibfile . . . \n\n";
    }
    # Run bibtex; its warnings about missing database entries drive the
    # searches done in the main loop.
    @bibtex_output = `$bibtex $auxfile`;
}

# Field delimiter of the long citation format, in a form that is safe to
# interpolate into a regular expression: the -sep argument (quoted) when
# one was given, otherwise a plain semicolon.
$semicolon = $new_separator ? quotemeta($new_separator) : ";";


# Main loop.  Each element of @bibtex_output is one line printed by
# bibtex (or a single dummy element in -c mode).  For every citation key
# that bibtex could not find, build a MathSciNet search, download the
# result page, and for each match fetch the item in the requested
# format.
#
# NOTE(review): the citation text is interpolated into a shell command
# inside single quotes; a key containing a single quote would break that
# quoting.  Left as is here -- confirm whether keys need sanitising.
foreach $warning (@bibtex_output) {
    # get citation
    if ($only_cite) {
        $citation_orig = $citation = $only_cite;
        $citation =~ tr/,/./;
    }
    elsif ($warning =~ (/^Warning--I didn\'t find a database entry for \"([^\"]*)\"$/)) {
        $citation_orig = $citation = $1;
        # skip keys that an earlier run already searched for
        next if check_bibfile($citation_orig);
    }
    else { next; }


    # split citation into author, etc.
    $citation =~ s/-and-/-/g;
    $citation =~ tr/./,/;
    $author = '';
    $title = '';
    $journal = '';
    $year = '';
    $dr = 'all';
    $misc = '';
    # Per-citation state that previously leaked from one citation to the
    # next: the year operator/value sent in the URL and the match count.
    $ord = '';
    $yr = '';
    $matches = 0;

    if ($citation =~ m/$semicolon/) {
        # long format: authors;titles;journals;year
        ($authors, $titles, $journals, $year) = split(/$semicolon/, $citation);
        $year = '' unless defined $year;
        $author = join('+and+', split(/-/, $authors));
        $title = join('+and+', split(/-/, $titles));
        $journal = join('+and+', split(/-/, $journals));
    }
    else {
        # short format: author-word-word-...[year]
        ($author, $subcitation) = split(/-/, $citation, 2);
        if ($subcitation =~ /([<>=]?\d+)\Z/) {
            $subcitation = $`;
            $year = $1; }
        $misc = join('+and+', split(/-/, $subcitation));
    }

    # parse year entry: an optional leading <, = or > selects the
    # comparison; a bare year means "equal".  No year searches all years.
    if ($year eq '') {
        $dr = 'all';
    }
    else {
        my %op_for = ('<' => 'lt', '=' => 'eq', '>' => 'gt');
        my $prefix = substr($year, 0, 1);
        if (exists $op_for{$prefix}) {
            $ord = $op_for{$prefix};
            $yr = substr($year, 1);
        }
        else {
            $ord = 'eq';
            $yr = $year;
        }
        $dr = 'pubyear';
    }

    # construct URL
    # NOTE(review): "preferred_language-en" looks as if it may have been
    # meant as "preferred_language=en"; kept unchanged -- confirm against
    # the server before altering the request.
    $wget_url =
        "\'http://$e_math/msnmain?fn=130&form=fullsearch&preferred_language-en&Submit=Start+Search&" .
        "pg4=AUCN&s4=$author&co4=AND" .
        "&pg5=TI&s5=$title&co5=AND" .
        "&pg6=JOUR&s6=$journal&co6=AND" .
        "&pg7=ALLF&s7=$misc" .
        "&dr=$dr" .
        "&yrop=$ord&arg3=$yr" .
        "&yearRangeFirst=&yearRangeSecond=" .
        "&pg3=ET&s3=All&l=20&reference_lists=show&redirect=$redirect\'";
    unless ($use_stdout) {
        bib_print("", "$bibtex_long_comment \n");
    }
    bib_print("working on citation \'$citation_orig\' \n",
              "$bsc   citation \'$citation_orig\' \n" );
    $match_info_printed = '';
    $done_getting_bibtex = '';

    # print $wget_url;

    # call wget and process its output
    foreach $line (`$wget $wget_url $wget_switches`) {
        # print $line;

        # get number of matches
        if (($line =~ (/Number of Matches:.*<\/b> ([0-9]*)/))
            or ($line =~ (/Item:.*<strong>([0-9]*)<\/strong>\s*$/))) {
            # print $line;

            $matches = $1;
            # print $matches;

            if ($matches > $max_matches) {
                bib_print("More than $max_matches matches found " .
                          "($matches); " .
                          "please refine your search criteria,\n" .
                          "or use the -m option. \n\n",
                          "$bsc More than $max_matches matches found " .
                          "($matches). \n$bsc\n");
                last;
            }
            elsif ( not $match_info_printed ) {
                $plural = "es";
                $match_info_printed = 1;
                if ($matches == 1) {$plural = ""}
                bib_print("$matches match$plural found. \n\n", "" );
            }
        }

        # given match, get MR number, convert to forms for use in URL.
        # With exactly one match only the first such line is used; with
        # several matches every such line is used.
        #elsif ($line =~ (/TYPE=\"checkbox\" VALUE=\"([0-9][0-9_a-z]*)\"/)) {
        elsif ((($matches > 1) or
                (($matches == 1) and (not $done_getting_bibtex)))
               and
               ($line =~ (/<strong>MR([^<]*)<\/strong>\s*<strong>\(([^)]*)\)<\/strong>/))) {

            #print $line;

            if ($matches == 1) { $done_getting_bibtex = 1 };

            # there are now new MR numbers: a unique 7 digit number
            # assigned to each item in the database.  Assign this to $mr,
            # and use it to search.  Assign old MR number to $old_mr, and
            # use it for printing.
            $mr = $1;
            $old_mr = $2;
            bib_print("", "$bsc Math Reviews number: $old_mr \n");
            $old_mr =~ tr/ //d;
            $old_mr =~ tr/_/:/;
            $old_mr =~ tr/\#/:/;
            $old_mr =~ tr/,/:/;

            # print $old_mr;

            $new_wget_url = "\'http://$e_math/mathscinet-getitem?mr=$mr&return=$getitem_return\'";

            # bibtex output

            if ($output_type eq 'bibtex') {

                # call wget again, process its output
                $at_sign_found = 0;

                foreach $line2 (`$wget $new_wget_url $wget_switches`) {

                    # look for @ at start of bibtex
                    if (not $at_sign_found) {
                        $at_sign_found = ($line2 =~ /@/);
                    }

                    if ($at_sign_found) {
                        # remove html tags from text and print it (if not blank)
                        $line2 = remove_html($line2);
                        unless (($line2 =~ (/^\s*$/)) or ($line2 =~ "MathSciNet")) {
                            $mr =~ s/CMP//;

                            # For a unique match, replace the key
                            # MathSciNet chose by the key the document
                            # cites.  \Q...\E keeps any regex
                            # metacharacter in the MR number literal.
                            if ($matches == 1 and $line2 =~ (/\Q$old_mr\E/)) {
                                $line2 =~ s/MR\Q$old_mr\E/$citation_orig/;
                            }
                            if ($line2 =~ "(c).*[0-9]*.*American Mathematical Society") {
                                # &bib_print("", "$bsc $line2\n$bsc\n");
                                bib_print ("", "\n");
                            }
                            else {
                                bib_print("", "$line2" );
                                if ($review && $line2 =~ "MRREV") {
                                    print_review($mr);
                                }
                            }
                        }
                    }
                }
            }

            # dvi, ps, pdf, txt output
            else {
                # Use a private variable for the dump switches.  The old
                # code overwrote the global $wget_switches here, so the
                # search for every later citation was run with
                # "-dump > file" and returned nothing.
                my $dump_switches = " -dump > MR$mr.$getitem_return";
                system ("$wget $new_wget_url $dump_switches");
                print "Output written to MR$mr.$getitem_return\n";
            }
        }

        # warning messages...
        elsif ($line =~ (/Sorry, no matches found/)) {
            bib_print("No matches found; please revise your search criteria.  \n\n",
                      "$bsc No matches found. \n$bsc\n");
            last;
        }
        elsif ($line =~ (/Sorry, /)) {
            bib_print("There was an unknown error.  " .
                      "Please check for typos, or just try again.\n\n",
                      "$bsc Syntax or server error. \n$bsc\n");
        }
    }
}

# All output has been produced; close the bib file so that everything
# written to it is on disk before bibtex reads it.
close(BIBFILE);

# run bibtex again, to make use of new keys.  exec only returns when it
# could not start the program, so report that instead of ending silently.
unless ($only_cite) {
    exec("$bibtex $auxfile")
	or die "Couldn't run $bibtex: $!\n";
}

########## simple subroutines

# Scan the bib file for the marker line that bibweb writes for every
# citation it works on, to see whether this citation was searched for
# before.
#
# Argument: the citation key exactly as cited.
# Reads:    $bibfile (file to scan) and $bsc (comment marker).
# Returns:  1 if a line "<bsc> citation '<key>'" is present, else 0
#           (also 0 when the file does not exist or cannot be read).
# Prints a notice on STDOUT when the key is found.
sub check_bibfile {
    my ($key) = @_;
    return 0 unless -e $bibfile;

    # Close the shared append handle first so that anything still
    # buffered on it reaches the file before it is scanned.  (The old
    # code got the same effect by re-opening BIBFILE for reading, and it
    # also left the handle closed on return.)
    close(BIBFILE);

    # The key is matched literally, whatever characters it contains.
    my $literal_key = quotemeta($key);
    my $found = 0;

    # Private lexical handle and three-argument open: the file name is
    # never interpreted as an open mode, and the caller's $_ is untouched.
    if (open(my $in, '<', $bibfile)) {
	while (my $entry = <$in>) {
	    if ($entry =~ /$bsc\s*citation\s*\'$literal_key\'/) {
		$found = 1;
		last;
	    }
	}
	close($in);
    }
    else {
	warn "Couldn't read $bibfile: $!\n";
    }

    if ($found) { print "You've searched for $key before.\n\n" };
    return $found;
}

# Strip HTML markup from a string: every closing tag (</blah>) becomes a
# single space, every remaining tag (<blah>) is deleted.  Returns the
# cleaned copy; the argument itself is not modified.
sub remove_html {
    my ($markup) = @_;
    for ($markup) {
	s{</[^>]*>}{ }g;
	s{<[^>]*>}{}g;
    }
    return $markup;
}

# Send a message to the screen and/or the bib file.
#
# Arguments: $_[0]  text meant for the screen
#            $_[1]  text meant for the bib file
# Reads:     $use_stdout, $use_bibtex, $bibfile.
#
# When writing to STDOUT only one of the two texts is printed: the bib
# text for bibtex output, the screen text otherwise.  When writing to a
# file, the screen text goes to STDOUT and the bib text is appended to
# $bibfile through the shared BIBFILE handle.  Dies if the bib file
# cannot be opened.
sub bib_print {
    my ($screen_text, $file_text) = @_;

    if ($use_stdout) {
	print($use_bibtex ? $file_text : $screen_text);
	return;
    }

    # Open the bib file only when the shared handle is not open already
    # (it used to be re-opened on every call), with a three-argument open
    # so the file name cannot be taken as part of the mode.
    unless (defined fileno(BIBFILE)) {
	open(BIBFILE, '>>', $bibfile)
	    or die "Couldn't open $bibfile: $!\n";
    }
    print $screen_text;
    print BIBFILE "$file_text";
}

# Download the MathSciNet page for one item and emit its review as a
# REVIEW field of the bibtex entry being written.
#
# Argument: the item's MR number as used in the mathscinet-getitem URL.
# Reads:    $wget, $wget_switches, $e_math.
#
# The page is scanned line by line with a counter: lines containing
# "citation to clipboard" or "<BR CLEAR=RIGHT>" advance it, and while it
# equals 2 each line is written (tags removed) as review text.  A line
# containing "<STRONG>Reviewed</STRONG>" is written followed by the
# closing " }," of the field.
sub print_review {
    my ($mr) = @_;

    my $command = "$wget \'http://$e_math/mathscinet-getitem?mr=$mr\' $wget_switches";
    my $count = 0;
    my $field_closed = 0;
    bib_print("", "    REVIEW = {");

    foreach my $line (`$command`) {
	# print $line;
	if ($line =~ "<STRONG>Reviewed</STRONG>") {
	    $count++;
	    chomp($line);
	    bib_print("", remove_html($line) . " },\n");
	    $field_closed = 1;
	}
	if ($count == 2) { bib_print("", remove_html($line)) }
	if ($line =~ "citation to clipboard") { $count++ }
	if ($line =~ "<BR CLEAR=RIGHT>") { $count++ }
    }

    # If the "Reviewed" line never showed up (empty download, changed
    # page layout) the field would stay open and unbalance the braces of
    # the whole bib file, so close it here.
    bib_print("", " },\n") unless $field_closed;
}
