#!/usr/bin/perl -w #split_definitions.pl # 2003 september 17 # This was written for 1 specific file. Definitions_of_Characters_and_2.txt # Just so we wouldn't have to retype everything in by hand. # Things to do: # - Fix the date stamp # - remove the accession number. # - put the multiple definitions in the same file # - create a webpage describing the input format and the output # - make it so the xml tags come from the current xml # - character and character group files # - when there are multiple definitions, do we want separate files? # --- like asymmetric -- (flowers, cones) # --- like axillary -- (axillary_flowers, axillary_cones, buds) # - files to check: (ones that would show problems) asymmetric, axillary, my @lines = <>; @lines = &clean_array(@lines); # takes care of special characters my @char_list = (); my @char_sibs = (); my @state_list = (); my @state_files = (); my @char_xml = (); my @state_xml = (); my @above = (); my @below = (); my @charGroup_xml = (); my @char_files = (); my @charGroup_files = (); my %file_names; my %dir_names = ('character' => 'character', 'charactergroup' => 'characterGroup', 'state' => 'state'); my %source; my %s_definition; %dtds = ('character' => 'http://www.isrl.uiuc.edu/~openkey/shared/character.dtd', 'state' => 'http://www.isrl.uiuc.edu/~openkey/shared/state.dtd', 'charactergroup' => 'http://www.isrl.uiuc.edu/~openkey/shared/characterGroup.dtd'); #my $state_dir = '/home/openkey/public_html/prairiepalnt/state/xml/'; #my $char_dir = '/home/openkey/public_html/SEastTree/character/xml/'; #my $charGroup_dir = '/home/openkey/public_html/SEastTree/characterGroup/xml/'; my $project = "plant"; $project = join_name($project); my $out2 = 'troubles'; # when there are troubles, they will print to a $out2 file my $iteration = 0; # for sub check_duplicates my $num_out = 0; # keeps track of accession number for($i = 0; $i < @lines; $i++){ chomp($lines[$i]); if($lines[$i] =~ /\w+/){ if($lines[$i] =~ /(.*)\s-\s(.*)/){ # It is a state with a definition @c_some = &handle_state($1,$2); }elsif($lines[$i] =~ /(.*)\t(.*)/){ # ($1)\t($2) # It is probably a reference or a page footer if($1 =~ /\s/){ # It is the page footer, and not really a reference # Example: Definitions_of_Characters_and_2.doc, 03/27/03 7:03 PM Page 1 of 1 }else{ # It really is a reference like this example: # RDMB Radford, Albert E., William C. Dickison, Jimmy R. Massey, and C. Ritchie Bell. # 1974. # Vascular Plant Systematics. # Harper & Row Publishers: New York, Evanston, San Francisco, London. $source{$1} = $2; ##print "Source abbreviation: $1 means: $2\n"; }# end if }else{ if($lines[$i-1]){ if($lines[$i-1] =~ /\w/){ #not sure what this is, it is not a character }else{ # This is a character if($lines[$i-2]){ if($lines[$i-2] =~ /\w/){ if($lines[$i-2] =~ /(.*)\s-\s(.*)/){ # if the last line was a state my $char = &handle_char(); push(@char_sibs,pop(@char_list)); #pop(@char_list); @state_list = (); &xml2file('Character',$char,@char_xml); }# end if }else{ if(@state_list >= 1){ my $char = &handle_char(); &xml2file('Character',$char,@char_xml); push(@char_sibs,pop(@char_list)); my $charGroup = &handle_charGroup(); &xml2file('characterGroup',$charGroup,@charGroup_xml); @state_list = (); }else{ print "Hey, no states! line was $lines[$i].\n"; }# end if @char_list = (); # a double line feed (between major charactergroups) @char_sibs = (); @state_list = (); }# end if if($lines[$i] =~ /^See (.*)/){ # implies something common like pubescent my $state2 = $1; if($state2 =~ /below/i){ my $term = $state2; $term =~ s/See\s//ig; $term =~ s/\sbelow.*//ig; $term =~ s/\s+/\s/g; #replace multiple spaces with a single space. my @terms = split(/and/,$term); push(@below, $char_list[-1]); }elsif($state2 =~ /above/i){ my $term = $state2; $term =~ s/See\s//ig; $term =~ s/\sabove.*//ig; $term =~ s/\s+/\s/g; #replace multiple spaces with a single space. my @terms = split(/and/,$term); push(@above, $char_list[-1]); }# $state2 = &join_name($state2); my $out2_file = &handle_char($state2,$lines[$i]); my $char = &handle_char(); push(@char_sibs,pop(@char_list)); @state_list = (); &xml2file('Character',$char,@char_xml); }else{ my $c = &new_character($lines[$i]); }# end if }# end if }# end if }# end if }# end if }# end if }# end for #&list_states(); sub handle_char{ my @list1 = @char_list; my @list2 = @state_list; my $char = pop(@list1); $char = &join_name($char); my $un_char = &unjoin($char); my $char_file = &get_out_file_name($char,'Character'); my $item = ''; @char_xml = (''); $item =' ' . $un_char . ''; push(@char_xml, $item); for ($st=0; $st<@list2; $st++){ my $state = $list2[$st]; $state = &join_name($state); my $un_state = &unjoin($state); my $long_state = $state . '_' . &join_name($char_list[0]); my $state_file = ''; if($file_names{$long_state}){ $state_file = $file_names{$long_state}; }elsif($file_names{$state}){ $state_file = $file_names{$state}; }else{ $state_file = get_out_file_name($out2,'State'); }# end if $item = '' . $un_state . ''; push(@char_xml,$item); }# end foreach $item = ' none yet'; push(@char_xml,$item); $item = ' ' . $un_char . ''; push(@char_xml,$item); push(@char_xml,' '); push(@char_xml,' '); push(@char_xml,' '); #for ($s=1; $s<@list1 -1; $s++){ # $item = ' ' . $list1[$s] . ''; # push(@char_xml,$item); #}# end foreach $item = ' ' . ''; push(@char_xml,$item); push(@char_xml,' '); $item = ' ' . ''; push(@char_xml,$item); push(@char_xml,' ,'); push(@char_xml,''); return $char; }# end sub sub handle_charGroup{ my @list1 = @char_list; my @list2 = @char_sibs; my $charGroup = $list1[0]; $charGroup = &join_name($charGroup); my $un_charGroup = &unjoin($charGroup); my $charGroup_file = &get_out_file_name($charGroup,'characterGroup'); my $item = ''; @charGroup_xml = (''); $item =' ' . $un_charGroup . ''; push(@charGroup_xml, $item); for ($st=0; $st<@list2; $st++){ my $char = $list2[$st]; $char = &join_name($char); my $un_char = &unjoin($char); my $char_file = $file_names{$char}; $item = '' . $un_char . ''; push(@charGroup_xml,$item); }# end foreach $item = ' none yet'; push(@charGroup_xml,$item); $item = ' ' . $un_charGroup . ''; push(@charGroup_xml,$item); push(@charGroup_xml,' '); push(@charGroup_xml,' '); push(@charGroup_xml,' '); #for ($s=1; $s<@list1 -1; $s++){ # $item = ' ' . $list1[$s] . ''; # push(@charGroup_xml,$item); #}# end foreach $item = ' ' . ''; push(@charGroup_xml,$item); push(@charGroup_xml,' <strong>'); $item = ' ' . $un_charGroup . ''; push(@charGroup_xml,$item); push(@charGroup_xml,' </strong>: '); push(@charGroup_xml,''); return $charGroup; }# end sub sub unjoin { my $name = shift; #chomp($name); $name =~ s/_/ /g; return $name; }# end sub unjoin #my @related = &break_related($related); sub break_related{ my $term = shift; my @list = &break_commas($term); my $count = @list; my @array; for($c = 0; $c < $count; $c++){ my $item = &join_name($list[$c]); #push(@array,' '); #push(@array,''.$item .''); push(@array,$item); }# end for return @array; }# end sub #--------------------------------- #Example problems with ands and , #see amorphic and haplomorphic #See Calyx Symmetry and Corolla Symmetry above #(see schiz. mericarp, schiz. nutlets & schiz. samaras) ##Seed Cone Shape Before Opening and Seed Cone Shape After Opening #(see illustrations for P. taeda, P. palustris and P. resinosa cones in K and elsewhere) #(see corm, rhizome & tiller) #(see both involute and rolled) #(see sterigma, not very good) #(see also woolly) #(see rhombic [poor example] and quadrate) #(see particular species for examples) #(see achene & berry) #(Also see leaf division.) #(Also see leaf complexity.) #Pubescence Type of Herbaceous Stems or Young Twigs (<= 1 year old) #See Pubescence below. #Other Surface Features of Herbaceous Stems or Young Twigs (<= 1 year old) #See Other Surface Features below. #Pubescence of 2-4-year-old Twigs #See Pubescence below. #Other Surface Features of 2-4-year-old Twigs #(see both crescent-shaped and broadly crescent) #(see Taxus baccata & Larix gemelini var. japonica) #Leaf Upper Surface Pubescence and Leaf Lower Surface Pubescence #See Pubescence below. #Leaf Upper Surface Pubescence Type and Leaf Lower Surface Pubescence Type #See Pubescence below. #Other Features of Leaf Upper Surface and Other Features of Leaf Lower Surface #See Other Surface Features below. #Leaf Venation and Leaflet Venation #(see pinnately netted, penni-parallel and reticulate) #--------------------------------- #my @synonyms = &handle_multiples($synonym); sub handle_multiples{ my $string = shift; my @stop_list = ('see', 'below', 'above', 'both', 'not very good', 'poor example', 'also'); my $string =~ s/\sand\s/,/ig; my $string =~ s/\s\&\s/,/g; foreach $stop(@stop_list){ my $string =~ s/$stop//ig; }# end foreach my @terms = split(/,/,$string); foreach $term(@terms){ print "$term\n"; }# end foreach return @terms; }# end sub sub something{ my $character = shift; my @list = @_; my @array; my $count = @list; my $item = ''; for($c = 0; $c < $count; $c++){ my $character = &join_name($list[$c]); $item = ' '; $item .= ''; push(@array, $item); }# end for $item = ' '; $item.=''; push(@array, $item); $item = ''; $item.= ' '; $item.= ''; $item.= $state; $item.= ''; push(@array, $item); push(@array, ' '); for($c2 = 0; $c2 < $count; $c2++){ push(@array, ''); }# end for return @array; }# end sub sub group_characters{ my $state = shift; my @list = @_; my @array; my $last = &join_name(pop(@list)); my $count = @list; for($c = 0; $c < $count; $c++){ my $character = &join_name($list[$c]); push(@array,' '); push(@array,''); }# end for push(@array, ' '); push(@array, ' ' . $state . ''); push(@array, ' '); for($c2 = 0; $c2 < $count; $c2++){ push(@array, ' '); push(@array, ''); }# end for return @array; }# end sub sub create_path{ my @list = @_; my $path = join('/',@list); #chomp($path); $path =~ s/\s/_/g; $path =~ s/_$//; $path =~ s/_\//\//g; #$path .= '/'; $path = '/' . $path . '/'; # I thought about putting / on both ends. return $path; }# end sub sub new_character{ my $new = shift; $new = &replace_lt($new); push(@char_list, lc($new)); return lc($new); }# end sub sub replace_lt{ my $string = shift; $string =~ s/\(\<\=/less than or equal to/g; return $string; }# end sub sub list_states { my @states = sort keys(%s_definition); foreach $state (@states){ print "$state\n"; }# end foreach }# end sub sub print_hash{ my $hash_pointer = shift; foreach $key ( sort alphaBYkey keys(%$hash_pointer) ) { print "something\n"; }# end foreach }# end sub sub alphaBYkey { $hash{$a} <=> $hash{$b} } sub clean_line{ my $li = shift; my $new = $li; $new =~ s/&/&/sg; $new =~ s//GT/sg; $new =~ s/\=/E/sg; $new =~ s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg; $new =~ s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg; return $new; }# end sub sub clean_array{ my @old = @_; my @new; for($a = 0; $a <@old; $a++){ my $n = $old[$a]; $n =~ s/&/&/sg; $n =~ s//GT/sg; $n =~ s/\=/E/sg; $n =~ s/[\x00-\x08\x0b-\x0c\x0e-\x1f]//sg; $n =~ s/([\x80-\xff])/sprintf("&#x%04x;",ord($1))/seg; push (@new, $n); }# end for return @new; }# end sub sub join_name{ my $name = shift; #chomp($name); $name =~ s/\(/ /g; $name =~ s/\)/ /g; $name =~ s/\[/ /g; $name =~ s/\]/ /g; $name =~ s/\'//g; $name =~ s/\"//g; $name =~ s/\&/and/g; $name =~ s/\,/ /g; $name =~ s/\s/_/g; $name =~ s/_+/_/g; $name =~ s/_$//; return $name; }# end sub #&get_out_file_name($name, $type); sub get_out_file_name { my $name = shift; my $type = shift; $num_out++; #my $dir = '/home/openkey/public_html/' . $project . '/' . $dir_names{lc($type)} . '/xml/'; my $dir = $project . '/' . $dir_names{lc($type)} . '/xml/'; my $file = ''; $file = $dir . $name . '.xml'; #if($num_out > 1){ # $file = $dir . $name . $num_out . '.xml'; #}else{ # $file = $dir . $name . '.xml'; #}# end if if(-e $file){ #if the file exists, then call this sub again so it can increase the accession_number my $p = $parent{$name}; $p = &join_name($p); my $p_file = $dir . $name . '_' . $p . '.xml'; #print "file is now $file\n"; #print "p_file is now $p_file\n"; $file_names{$p} = $p_file; system("mv $file $p_file"); $name = $name . $char_list[0]; $file = &get_out_file_name($name, $type); }# end if system("touch $file"); system("chmod 2666 $file"); $file_names{$name} = $file; $num_out = 0; $parent{$name} = $char_list[0]; return $file; }# end get_out_file_name #&xml2file('State',$state_,@state_xml); #&xml2file('Character', $char, @char_xml); # used to be &xml2file('State',$state_dir,$state_dtd,$state_,@state_xml); sub xml2file{ #my ($array_ref) = @_; my $type = shift; my $out = shift; my @array = @_; my $dtd = $dtds{lc($type)}; my $out_file = $file_names{$out}; #&get_out_file_name($out, $type); #update the file list open(OUT, ">$out_file") || die("cannot open $out_file: $!"); print OUT '', "\n"; my $item = ''; print OUT $item; print OUT "\n"; $item = ''; print OUT $item; print OUT "\n"; for($r=0; $r<@array; $r++){ print OUT $array[$r]; print OUT "\n"; }# end for close(OUT); system ("perl /home/openkey/bin/utf8.pl $out_file"); print "Just output the file $out_file\n"; }# end sub xml2file sub handle_state{ my $state = lc(shift); my $definition = shift; my $related = ''; my $synonym = ''; if(($definition)&&($definition =~ /\w+/)){ if($definition =~ /Compare with\s(.*)\.\)/){ $related = lc($1); }# end if if($definition =~ /see\s(.*)\)/i){ $synonym = lc($1); }# end if if($definition =~ /(\[.*\])/){ # Then the source (or sources are mentioned) #print "The source is $1\n"; }else{ # The definition was created by Stephen Seiberling. (see note below) $definition .=' [Stephen Seiberling]'; }# end if }else{ $definition = $state; }# end if #chomp($char_list[0]); # put this somewhere more appropriate #$definition =~ s/^\s+//; unless($char_list[0] =~ /definitions of characters and additional terms contained in definitions/){ $definition = '<em>(' . $char_list[0] . ')</em>' . " $definition"; # Should this be $char_list[0] or $char_list[-1]? # It is more likely that -1 would change over time. }# end unless if($s_definition{$state}){ #there already was a definition, so now there might be a second one my $old_def = $s_definition{$state}; $s_definition{$state} = $old_def . " $definition"; # What we really want to do: open the old file, put in a new definition line, and a new display section # # }else{ # If the term has not been defined, then we have one here $s_definition{$state} = $definition; }# end if my @characterGrouped = &group_characters($state,@char_list); push(@state_list, $state); my $state_ = &join_name($state); push(@states, $state_); my $state_file = &get_out_file_name($state_, state); #push(@state_files, $state_file); #my @related_states = &break_commas($related); # -------------------- # @state_xml = (''); my $item = ' ' . $state . ''; push(@state_xml, $item); $item = ' ' . $definition . ' '; push(@state_xml, $item); $item = ' none yet'; push(@state_xml, $item); $item = ' '; push(@state_xml, $item); #yooooo hoooo #@synonym $item = ' ' . $synonym . ''; push(@state_xml, $item); push(@state_xml, $item); $item = ' '; push(@state_xml, $item); $item = ' '; push(@state_xml, $item); #yooooo hoooo #@related, $item = ' ' . $related . ' '; push(@state_xml, $item); $item = ' '; push(@state_xml, $item); $item = ' '; push(@state_xml, $item); $item = ' '; push(@state_xml, $item); #' <strong>', #$char_list[-1], #' </strong>:', $item = ' ' . $state . ' '; push(@state_xml, $item); $item = ' ,'; push(@state_xml, $item); $item = ''; push(@state_xml, $item); &xml2file('State',$state_,@state_xml); @state_xml = (); return @characterGrouped; }# end handle_state sub get_todays_date { my ($year, $mon, $day, $hour, $min) = (localtime)[5,4,3,2,1];#system ("date"); $year += 1900; $mon += 1; my @date = ($year, $mon, $day, $hour, $min); my $date = join('_', @date); return $date; }# end sub ### NOTES #### #Definitions with no source listed were written by Stephen Seiberling. #http://www.cs.nott.ac.uk/~qiu/Teaching/SWTs/Perl-HandOut.html