#!/usr/bin/perl -w
#split_definitions.pl
# 2003 september 17
# This was written for 1 specific file. Definitions_of_Characters_and_2.txt
# Just so we wouldn't have to retype everything in by hand.
# Things to do:
# - Fix the date stamp
# - remove the accession number.
# - put the multiple definitions in the same file
# - create a webpage describing the input format and the output
# - make it so the xml tags come from the current xml
# - character and character group files
# - when there are multiple definitions, do we want separate files?
# --- like asymmetric -- (flowers, cones)
# --- like axillary -- (axillary_flowers, axillary_cones, buds)
# - files to check: (ones that would show problems) asymmetric, axillary,
my @lines = <>;
@lines = &clean_array(@lines); # takes care of special characters
my @char_list = ();
my @char_sibs = ();
my @state_list = ();
my @state_files = ();
my @char_xml = ();
my @state_xml = ();
my @above = ();
my @below = ();
my @charGroup_xml = ();
my @char_files = ();
my @charGroup_files = ();
my %file_names;
my %dir_names = ('character' => 'character',
'charactergroup' => 'characterGroup',
'state' => 'state');
my %source;
my %s_definition;
%dtds = ('character' => 'http://www.isrl.uiuc.edu/~openkey/shared/character.dtd',
'state' => 'http://www.isrl.uiuc.edu/~openkey/shared/state.dtd',
'charactergroup' => 'http://www.isrl.uiuc.edu/~openkey/shared/characterGroup.dtd');
#my $state_dir = '/home/openkey/public_html/prairiepalnt/state/xml/';
#my $char_dir = '/home/openkey/public_html/SEastTree/character/xml/';
#my $charGroup_dir = '/home/openkey/public_html/SEastTree/characterGroup/xml/';
my $project = "plant";
$project = join_name($project);
my $out2 = 'troubles'; # when there are troubles, they will print to a $out2 file
my $iteration = 0; # for sub check_duplicates
my $num_out = 0; # keeps track of accession number
for($i = 0; $i < @lines; $i++){
chomp($lines[$i]);
if($lines[$i] =~ /\w+/){
if($lines[$i] =~ /(.*)\s-\s(.*)/){
# It is a state with a definition
@c_some = &handle_state($1,$2);
}elsif($lines[$i] =~ /(.*)\t(.*)/){
# ($1)\t($2)
# It is probably a reference or a page footer
if($1 =~ /\s/){
# It is the page footer, and not really a reference
# Example: Definitions_of_Characters_and_2.doc, 03/27/03 7:03 PM Page 1 of 1
}else{
# It really is a reference like this example:
# RDMB Radford, Albert E., William C. Dickison, Jimmy R. Massey, and C. Ritchie Bell.
# 1974.
# Vascular Plant Systematics.
# Harper & Row Publishers: New York, Evanston, San Francisco, London.
$source{$1} = $2;
##print "Source abbreviation: $1 means: $2\n";
}# end if
}else{
if($lines[$i-1]){
if($lines[$i-1] =~ /\w/){
#not sure what this is, it is not a character
}else{
# This is a character
if($lines[$i-2]){
if($lines[$i-2] =~ /\w/){
if($lines[$i-2] =~ /(.*)\s-\s(.*)/){ # if the last line was a state
my $char = &handle_char();
push(@char_sibs,pop(@char_list));
#pop(@char_list);
@state_list = ();
&xml2file('Character',$char,@char_xml);
}# end if
}else{
if(@state_list >= 1){
my $char = &handle_char();
&xml2file('Character',$char,@char_xml);
push(@char_sibs,pop(@char_list));
my $charGroup = &handle_charGroup();
&xml2file('characterGroup',$charGroup,@charGroup_xml);
@state_list = ();
}else{
print "Hey, no states! line was $lines[$i].\n";
}# end if
@char_list = (); # a double line feed (between major charactergroups)
@char_sibs = ();
@state_list = ();
}# end if
if($lines[$i] =~ /^See (.*)/){ # implies something common like pubescent
my $state2 = $1;
if($state2 =~ /below/i){
my $term = $state2;
$term =~ s/See\s//ig;
$term =~ s/\sbelow.*//ig;
$term =~ s/\s+/\s/g; #replace multiple spaces with a single space.
my @terms = split(/and/,$term);
push(@below, $char_list[-1]);
}elsif($state2 =~ /above/i){
my $term = $state2;
$term =~ s/See\s//ig;
$term =~ s/\sabove.*//ig;
$term =~ s/\s+/\s/g; #replace multiple spaces with a single space.
my @terms = split(/and/,$term);
push(@above, $char_list[-1]);
}#
$state2 = &join_name($state2);
my $out2_file = &handle_char($state2,$lines[$i]);
my $char = &handle_char();
push(@char_sibs,pop(@char_list));
@state_list = ();
&xml2file('Character',$char,@char_xml);
}else{
my $c = &new_character($lines[$i]);
}# end if
}# end if
}# end if
}# end if
}# end if
}# end if
}# end for
#&list_states();
sub handle_char{
my @list1 = @char_list;
my @list2 = @state_list;
my $char = pop(@list1);
$char = &join_name($char);
my $un_char = &unjoin($char);
my $char_file = &get_out_file_name($char,'Character');
my $item = '';
@char_xml = ('');
$item =' ' . $un_char . '';
push(@char_xml, $item);
for ($st=0; $st<@list2; $st++){
my $state = $list2[$st];
$state = &join_name($state);
my $un_state = &unjoin($state);
my $long_state = $state . '_' . &join_name($char_list[0]);
my $state_file = '';
if($file_names{$long_state}){
$state_file = $file_names{$long_state};
}elsif($file_names{$state}){
$state_file = $file_names{$state};
}else{
$state_file = get_out_file_name($out2,'State');
}# end if
$item = '' . $un_state . '';
push(@char_xml,$item);
}# end foreach
$item = ' none yet';
push(@char_xml,$item);
$item = ' ' . $un_char . '';
push(@char_xml,$item);
push(@char_xml,' ');
push(@char_xml,' ');
push(@char_xml,' ');
#for ($s=1; $s<@list1 -1; $s++){
# $item = ' ' . $list1[$s] . '';
# push(@char_xml,$item);
#}# end foreach
$item = ' ' . '';
push(@char_xml,$item);
push(@char_xml,' ');
$item = ' ' . '';
push(@char_xml,$item);
push(@char_xml,' ,');
push(@char_xml,'');
return $char;
}# end sub
sub handle_charGroup{
my @list1 = @char_list;
my @list2 = @char_sibs;
my $charGroup = $list1[0];
$charGroup = &join_name($charGroup);
my $un_charGroup = &unjoin($charGroup);
my $charGroup_file = &get_out_file_name($charGroup,'characterGroup');
my $item = '';
@charGroup_xml = ('');
$item =' ' . $un_charGroup . '';
push(@charGroup_xml, $item);
for ($st=0; $st<@list2; $st++){
my $char = $list2[$st];
$char = &join_name($char);
my $un_char = &unjoin($char);
my $char_file = $file_names{$char};
$item = '' . $un_char . '';
push(@charGroup_xml,$item);
}# end foreach
$item = ' none yet';
push(@charGroup_xml,$item);
$item = ' ' . $un_charGroup . '';
push(@charGroup_xml,$item);
push(@charGroup_xml,' ');
push(@charGroup_xml,' ');
push(@charGroup_xml,' ');
#for ($s=1; $s<@list1 -1; $s++){
# $item = ' ' . $list1[$s] . '';
# push(@charGroup_xml,$item);
#}# end foreach
$item = ' ' . '';
push(@charGroup_xml,$item);
push(@charGroup_xml,' <strong>');
$item = ' ' . $un_charGroup . '';
push(@charGroup_xml,$item);
push(@charGroup_xml,' </strong>: ');
push(@charGroup_xml,'');
return $charGroup;
}# end sub
sub unjoin {
my $name = shift;
#chomp($name);
$name =~ s/_/ /g;
return $name;
}# end sub unjoin
#my @related = &break_related($related);
sub break_related{
my $term = shift;
my @list = &break_commas($term);
my $count = @list;
my @array;
for($c = 0; $c < $count; $c++){
my $item = &join_name($list[$c]);
#push(@array,' ');
#push(@array,''.$item .'');
push(@array,$item);
}# end for
return @array;
}# end sub
#---------------------------------
#Example problems with ands and ,
#see amorphic and haplomorphic
#See Calyx Symmetry and Corolla Symmetry above
#(see schiz. mericarp, schiz. nutlets & schiz. samaras)
##Seed Cone Shape Before Opening and Seed Cone Shape After Opening
#(see illustrations for P. taeda, P. palustris and P. resinosa cones in K and elsewhere)
#(see corm, rhizome & tiller)
#(see both involute and rolled)
#(see sterigma, not very good)
#(see also woolly)
#(see rhombic [poor example] and quadrate)
#(see particular species for examples)
#(see achene & berry)
#(Also see leaf division.)
#(Also see leaf complexity.)
#Pubescence Type of Herbaceous Stems or Young Twigs (<= 1 year old)
#See Pubescence below.
#Other Surface Features of Herbaceous Stems or Young Twigs (<= 1 year old)
#See Other Surface Features below.
#Pubescence of 2-4-year-old Twigs
#See Pubescence below.
#Other Surface Features of 2-4-year-old Twigs
#(see both crescent-shaped and broadly crescent)
#(see Taxus baccata & Larix gemelini var. japonica)
#Leaf Upper Surface Pubescence and Leaf Lower Surface Pubescence
#See Pubescence below.
#Leaf Upper Surface Pubescence Type and Leaf Lower Surface Pubescence Type
#See Pubescence below.
#Other Features of Leaf Upper Surface and Other Features of Leaf Lower Surface
#See Other Surface Features below.
#Leaf Venation and Leaflet Venation
#(see pinnately netted, penni-parallel and reticulate)
#---------------------------------
#my @synonyms = &handle_multiples($synonym);
sub handle_multiples{
my $string = shift;
my @stop_list = ('see', 'below', 'above', 'both', 'not very good', 'poor example', 'also');
my $string =~ s/\sand\s/,/ig;
my $string =~ s/\s\&\s/,/g;
foreach $stop(@stop_list){
my $string =~ s/$stop//ig;
}# end foreach
my @terms = split(/,/,$string);
foreach $term(@terms){
print "$term\n";
}# end foreach
return @terms;
}# end sub
sub something{
my $character = shift;
my @list = @_;
my @array;
my $count = @list;
my $item = '';
for($c = 0; $c < $count; $c++){
my $character = &join_name($list[$c]);
$item = ' ';
$item .= '';
push(@array, $item);
}# end for
$item = ' ';
$item.='';
push(@array, $item);
$item = '';
$item.= ' ';
$item.= '';
$item.= $state;
$item.= '';
push(@array, $item);
push(@array, ' ');
for($c2 = 0; $c2 < $count; $c2++){
push(@array, '');
}# end for
return @array;
}# end sub
sub group_characters{
my $state = shift;
my @list = @_;
my @array;
my $last = &join_name(pop(@list));
my $count = @list;
for($c = 0; $c < $count; $c++){
my $character = &join_name($list[$c]);
push(@array,' ');
push(@array,'');
}# end for
push(@array, ' ');
push(@array, ' ' . $state . '');
push(@array, ' ');
for($c2 = 0; $c2 < $count; $c2++){
push(@array, ' ');
push(@array, '');
}# end for
return @array;
}# end sub
sub create_path{
my @list = @_;
my $path = join('/',@list);
#chomp($path);
$path =~ s/\s/_/g;
$path =~ s/_$//;
$path =~ s/_\//\//g;
#$path .= '/';
$path = '/' . $path . '/'; # I thought about putting / on both ends.
return $path;
}# end sub
sub new_character{
my $new = shift;
$new = &replace_lt($new);
push(@char_list, lc($new));
return lc($new);
}# end sub
sub replace_lt{
my $string = shift;
$string =~ s/\(\<\=/less than or equal to/g;
return $string;
}# end sub
sub list_states {
my @states = sort keys(%s_definition);
foreach $state (@states){
print "$state\n";
}# end foreach
}# end sub
sub print_hash{
my $hash_pointer = shift;
foreach $key ( sort alphaBYkey keys(%$hash_pointer) ) {
print "something\n";
}# end foreach
}# end sub
sub alphaBYkey { $hash{$a} <=> $hash{$b} }
sub clean_line{
my $li = shift;
my $new = $li;
$new =~ s/&/&/sg;
$new =~ s//GT/sg;
$new =~ s/\=/E/sg;
$new =~ s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg;
$new =~ s/([\x80-\xff])/sprintf("%04x;",ord($1))/seg;
return $new;
}# end sub
sub clean_array{
my @old = @_;
my @new;
for($a = 0; $a <@old; $a++){
my $n = $old[$a];
$n =~ s/&/&/sg;
$n =~ s//GT/sg;
$n =~ s/\=/E/sg;
$n =~ s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg;
$n =~ s/([\x80-\xff])/sprintf("%04x;",ord($1))/seg;
push (@new, $n);
}# end for
return @new;
}# end sub
sub join_name{
my $name = shift;
#chomp($name);
$name =~ s/\(/ /g;
$name =~ s/\)/ /g;
$name =~ s/\[/ /g;
$name =~ s/\]/ /g;
$name =~ s/\'//g;
$name =~ s/\"//g;
$name =~ s/\&/and/g;
$name =~ s/\,/ /g;
$name =~ s/\s/_/g;
$name =~ s/_+/_/g;
$name =~ s/_$//;
return $name;
}# end sub
#&get_out_file_name($name, $type);
sub get_out_file_name {
my $name = shift;
my $type = shift;
$num_out++;
#my $dir = '/home/openkey/public_html/' . $project . '/' . $dir_names{lc($type)} . '/xml/';
my $dir = $project . '/' . $dir_names{lc($type)} . '/xml/';
my $file = '';
$file = $dir . $name . '.xml';
#if($num_out > 1){
# $file = $dir . $name . $num_out . '.xml';
#}else{
# $file = $dir . $name . '.xml';
#}# end if
if(-e $file){ #if the file exists, then call this sub again so it can increase the accession_number
my $p = $parent{$name};
$p = &join_name($p);
my $p_file = $dir . $name . '_' . $p . '.xml';
#print "file is now $file\n";
#print "p_file is now $p_file\n";
$file_names{$p} = $p_file;
system("mv $file $p_file");
$name = $name . $char_list[0];
$file = &get_out_file_name($name, $type);
}# end if
system("touch $file");
system("chmod 2666 $file");
$file_names{$name} = $file;
$num_out = 0;
$parent{$name} = $char_list[0];
return $file;
}# end get_out_file_name
#&xml2file('State',$state_,@state_xml);
#&xml2file('Character', $char, @char_xml);
# used to be &xml2file('State',$state_dir,$state_dtd,$state_,@state_xml);
sub xml2file{
#my ($array_ref) = @_;
my $type = shift;
my $out = shift;
my @array = @_;
my $dtd = $dtds{lc($type)};
my $out_file = $file_names{$out};
#&get_out_file_name($out, $type);
#update the file list
open(OUT, ">$out_file") || die("cannot open $out_file: $!");
print OUT '', "\n";
my $item = '';
print OUT $item;
print OUT "\n";
$item = '';
print OUT $item;
print OUT "\n";
for($r=0; $r<@array; $r++){
print OUT $array[$r];
print OUT "\n";
}# end for
close(OUT);
system ("perl /home/openkey/bin/utf8.pl $out_file");
print "Just output the file $out_file\n";
}# end sub xml2file
sub handle_state{
my $state = lc(shift);
my $definition = shift;
my $related = '';
my $synonym = '';
if(($definition)&&($definition =~ /\w+/)){
if($definition =~ /Compare with\s(.*)\.\)/){
$related = lc($1);
}# end if
if($definition =~ /see\s(.*)\)/i){
$synonym = lc($1);
}# end if
if($definition =~ /(\[.*\])/){
# Then the source (or sources are mentioned)
#print "The source is $1\n";
}else{
# The definition was created by Stephen Seiberling. (see note below)
$definition .=' [Stephen Seiberling]';
}# end if
}else{
$definition = $state;
}# end if
#chomp($char_list[0]); # put this somewhere more appropriate
#$definition =~ s/^\s+//;
unless($char_list[0] =~ /definitions of characters and additional terms contained in definitions/){
$definition = '<em>(' . $char_list[0] . ')</em>' . " $definition";
# Should this be $char_list[0] or $char_list[-1]?
# It is more likely that -1 would change over time.
}# end unless
if($s_definition{$state}){ #there already was a definition, so now there might be a second one
my $old_def = $s_definition{$state};
$s_definition{$state} = $old_def . " $definition";
# What we really want to do: open the old file, put in a new definition line, and a new display section
#
#
}else{
# If the term has not been defined, then we have one here
$s_definition{$state} = $definition;
}# end if
my @characterGrouped = &group_characters($state,@char_list);
push(@state_list, $state);
my $state_ = &join_name($state);
push(@states, $state_);
my $state_file = &get_out_file_name($state_, state);
#push(@state_files, $state_file);
#my @related_states = &break_commas($related);
# -------------------- #
@state_xml = ('');
my $item = ' ' . $state . '';
push(@state_xml, $item);
$item = ' ' . $definition . ' ';
push(@state_xml, $item);
$item = ' none yet';
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
#yooooo hoooo
#@synonym
$item = ' ' . $synonym . '';
push(@state_xml, $item);
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
#yooooo hoooo
#@related,
$item = ' ' . $related . ' ';
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
$item = ' ';
push(@state_xml, $item);
#' <strong>',
#$char_list[-1],
#' </strong>:',
$item = ' ' . $state . ' ';
push(@state_xml, $item);
$item = ' ,';
push(@state_xml, $item);
$item = '';
push(@state_xml, $item);
&xml2file('State',$state_,@state_xml);
@state_xml = ();
return @characterGrouped;
}# end handle_state
sub get_todays_date {
my ($year, $mon, $day, $hour, $min) = (localtime)[5,4,3,2,1];#system ("date");
$year += 1900;
$mon += 1;
my @date = ($year, $mon, $day, $hour, $min);
my $date = join('_', @date);
return $date;
}# end sub
### NOTES ####
#Definitions with no source listed were written by Stephen Seiberling.
#http://www.cs.nott.ac.uk/~qiu/Teaching/SWTs/Perl-HandOut.html