The script for the previous.
karttu at megabaud.fi
karttu at megabaud.fi
Wed Jan 2 23:55:25 CET 2002
listAs.pl follows:
#!/usr/bin/perl
#
# listAs.pl -- An ad hoc parsing script for the OEIS %A lines in the files
# fetched from http://www.research.att.com/~njas/sequences/eisBTfry000$a.txt
# where $a ranges from 00 to some number (say 99).
# Written 2. Jan 2002 by karttu
#
#
# Usage:
# cat eis*.txt | perl listAs.pl | sort | uniq -c | sort -nr > authcounts.txt &
#
# The %A lines that interest us have various formats:
#
# %A A034825 njas
# Clark Kimberling, ck6 at cedar.evansville.edu
# Clark Kimberling (ck6 at cedar.evansville.edu)
# encyclopedia at pommard.inria.fr, Jan 25 2000
# Patrick De Geest (pdg at worldofnumbers.com), Jun 1998.
# njas, Robert G. Wilson v (rgwv at kspaint.com)
# Robert G. Wilson v (rgwv at kspaint.com), Aug 10 2001
# rkg at cpsc.ucalgary.ca (Richard Guy)
# njas,jhc
# njas, mb, Robert G. Wilson v (rgwv at kspaint.com)
# Antti.Karttunen at iki.fi Oct 28 200
# Antti.Karttunen at iki.fi (karttu at megabaud.fi) Sep 03 2000
# Antti Karttunen (karttu at megabaud.fi) and Patrick De Geest (pdg at worldofnumbers.com), Nov 1999.
# jhc [ conway at math.Princeton.EDU ]
# Jan Kristian Haugland (jankrihau at hotmail.com)
# Jan.Hagberg at stat.su.se
# mlb at well.com (Marc Le Brun)
# Jud McCranie and Carlos Rivera (jud.mccranie at mindspring.com)
#
# and many more pathological cases...
#
sub trim_blankos_and_trash
{
my($convstring) = @_;
for($convstring)
{
s/^.* by //o; # Submitted by, Suggested by, Sent by, etc.
s/^by //oi; # Bysomebody
s/^Including //oi; #
s/^["\s]+//o;
s/[\]\)\s"\.]+$//o;
}
return($convstring);
}
sub scan_input
{
my($file) = @_;
my($adr1);
while($_ = <$file>)
{
if(/^\%A [^ ]* /o)
{
my $rest = $';
chop($rest);
# Submitters separated by commas, semicolons and word " and ":
my @cs = split(/\s*,\s*|\s*\;\s*|\s* and \s*/,$rest);
foreach $adr1 (@cs)
{
my @words;
my $word;
if(($adr1 !~ /^(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-8]/oi)
&&
($adr1 !~ /^[0-9]*$/o))
{
if($adr1 =~ /^([^[(<]*)[[(<]\s*([^]>)]*)/o) # Name followed by e-mail address in parentheses? (Or the other way)
{
my $first = trim_blankos_and_trash($1);
my $second = trim_blankos_and_trash($2);
if(($first =~ /@/o) && ($second !~ /@/o)) # Only the first part contains mail address? like: mlb at well.com (Marc Le Brun)
{
$adr1 = $second;
}
else # Presumably it's the second part in parenthesis which is an e-mail.
{
$adr1 = $first;
}
# print "The first='$first', the second='$second'\n";
}
else { $adr1 = trim_blankos_and_trash($adr1); }
if($adr1 =~ /^([^\s]*) (Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec) [0-8]/oi) # Reaping my own stupidity.
{
$adr1 = $1;
}
if($adr1 =~/^([^@]*)@/o) # An email is the best we have...
{
my $convstring = $first = $1;
for($convstring) { s/\./ /o; }
if($first ne $convstring) { $adr1 = $convstring; }
}
if($adr1 =~ /^[0-9]+/o) { next; } # Pieces of dates...
my @words = split(/\s+/,$adr1);
foreach $word (@words) { print ucfirst(lc($word)); print ' '; }
print "\n";
}
# else { print "Ignoring date: ",$adr1,"\n"; }
}
}
}
print "\n";
}
&scan_input(*STDIN);
-----------------------------------------------------------------------
And the C-shell script for downloading the whole database from OEIS:
#!/bin/csh
if ("a" == "$1a") then
echo "Usage: $0 outputfilename (takes more than thirty megabytes)"
exit 1
endif
set a=0
luuppi:
if ($a < 10) set a=0$a
echo "Dumping http://www.research.att.com/~njas/sequences/eisBTfry000$a.txt"
lynx -source http://www.research.att.com/~njas/sequences/eisBTfry000$a.txt > eis$a.txt
if (`fgrep -c 'Document Not Found' eis$a.txt` > 0) then
echo "/~njas/sequences/eisBTfry000$a.txt not found anymore, finishing."
rm -f eis$a.txt
lynx -source http://www.research.att.com/~njas/sequences/recent.txt > recent.txt
exit 0
endif
set a=`expr $a + 1`
goto luuppi
More information about the SeqFan
mailing list