Difference between revisions of "Lighthouse Parser.php"

From Organic Design wiki
 
({{legacy}})
Line 1: Line 1:
<?
+
{{legacy}}
 +
<php><?
 
# Send input file through xpdf.pdftotext into tmp file and read into $file
 
# Send input file through xpdf.pdftotext into tmp file and read into $file
 
$root=$_SERVER['DOCUMENT_ROOT'];
 
$root=$_SERVER['DOCUMENT_ROOT'];
Line 214: Line 215:
 
</html>
 
</html>
 
<?}?>
 
<?}?>
 +
</php>

Revision as of 20:23, 30 May 2011

Legacy.svg Legacy: This article describes a concept that has been superseded in the course of ongoing development on the Organic Design wiki. Please do not develop this any further or base work on this concept; it is only useful as a historic record of work done. You may find a link to the currently used concept or function in this article; if not, you can contact the author to find out what has taken the place of this legacy item.

<php><?

# Send input file through xpdf.pdftotext into tmp file and read into $file

# Convert the PDF named by the "pdf" query-string parameter to text with xpdf's
# pdftotext (-layout keeps the column positions the later parsing relies on) and
# load the result into $file as an array of lines.
$root=$_SERVER['DOCUMENT_ROOT'];

# $_GET replaces $HTTP_GET_VARS, which no longer exists since PHP 5.4.
# A missing or non-string parameter stops here with the same message the
# unreadable-file case below uses, instead of running pdftotext with no input.
$pdf=(isset($_GET['pdf'])&&is_string($_GET['pdf']))?$_GET['pdf']:'';
if ($pdf==='') die("Couldn't read or parse pdf file - supply pdf=filename in query-string");

# The text output is written beside the PDF (foo.pdf -> foo.txt).
# The original called tempnam() first but overwrote its result on the next
# statement, leaving an unused empty temp file behind on every request, so that
# call is dropped. preg_replace stands in for ereg_replace (removed in PHP 7);
# \z anchors at the very end of the string, as the POSIX "$" did.
# NOTE(review): if the name does not end in "pdf", $tmp equals $pdf, so pdftotext
# is asked to write over its own input and file() then reads the named file
# as-is - confirm whether non-pdf names should be rejected outright.
$tmp=preg_replace('/pdf\z/','txt',$pdf);

# The filename comes straight from the query-string, so both paths are quoted
# before they reach the shell.
@system("$root\\..\\xpdf\\pdftotext -layout ".escapeshellarg($pdf)." ".escapeshellarg($tmp));

# file() gives false on failure and an empty array for an empty file; both abort.
@$file=file($tmp) or die("Couldn't read or parse pdf file - supply pdf=filename in query-string");

# Output table headings:
# if $html is present in query-string, output is rendered as a table in an HTML doc,
# otherwise output is a tab-separated list

$html=array_key_exists('html',$HTTP_GET_VARS); if ($html) {?>

No. Name Position Characteristic Height Range Structure Remarks Type Chart Area $v) if ($v>$max) {$max=$v; $p=$k;} $layout[$pages]['cols']['1-2']=$p; $layout[$pages]['cols']['2-3']=$p+12; $pages++; # Last four boundaries work well off header text position at start of this new page $layout[$pages]['cols']['3-4']=strpos($line,"Height")-1; $layout[$pages]['cols']['4-5']=strpos($line,"Range"); $layout[$pages]['cols']['5-6']=strpos($line,"Range")+6; $layout[$pages]['cols']['6-7']=(strpos($last_line,"7")+strpos($last_line,"8"))/2; # Prepare for next page $kind=='page2'; $positions_found=array(); } elseif (preg_match("/^(.+)[0-9][0-9] [0-9][0-9] [0-9][0-9] [NW]/",$line,$m)) { # Position data found in this entry, record its location $positions_found[strlen($m[1])]++; } else $kind='general'; # Prepare for next line iteration $last_line=$line; $last_kind=$kind; } # _________________________________________________________________________________________________________________________________ # # PARSE#2 - CONTENT EXTRACTION LOOP # Initialise line-loop environment $row=array('','','','','','','',''); $page=0; $entries=array(''=>'dont render null rows'); $extracted=0; $current_entry=''; $last_kind=''; $chart=0; $area=''; # Loop through lines, extract & render content foreach ($file as $line) { # Determine the kind of line from content $line=rtrim($line); $last_kind=$kind; if (preg_match("/^([0-9]+) +/",$line,$entry_match)) $kind='entry'; # Numbered entry elseif (str_replace(" ","",$line)=="") $kind='empty'; # Empty elseif (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) { # New page $page++; //print "\n"; $kind='page'; } elseif (preg_match("/\\(Chart ([0-9]+)\\)/",$line,$m)&&($current_entry=='')) { # Chart $kind='chart'; $chart=$m[1]; } elseif (($last_kind=='empty')&&(!ereg("SECTION",$line)&&!ereg("^[ 0-9]+$",$line)&&preg_match("/^ {30,80}([^ ].+)$/",$line,$m))) { # Area (type 1) if (!ereg(" {2,}",$m[1])) { $kind='area'; 
$area=$m[1]; } } else $kind='general'; # _________________________________________________________________________________________________________________________________ # # ACCUMULATE COLUMN CONTENT FOR CURRENT MULTILINE ROW # If line has an entry number, append to row[0] ($current_entry is the primary entry num only) if ($kind=='entry') { if ($current_entry) $row[0].=$enlf.$entry_match[1]; else $row[0]=$current_entry=$entry_match[1]; } # If line is not empty, but also not a page header, then extract content into current row if (($kind=='general')||($kind=='entry')) { # Remove entry num if any because already dealt with separately $tmp=ereg_replace("^([0-9]+)","",$line); # Loop through content a word at a time (incl. spaces before each word for position info) $current_position=strlen($line)-strlen($tmp); # Account for removed entry num preg_match_all("/( +[^ ]+)/",$tmp,$words); $last_col=-1; foreach ($words[1] as $sw) { # Get length of space and remove from word preg_match("/^( +)([^ ]+)$/",$sw,$m); $space=strlen($m[1]); $word=$m[2]; # For each word assign to a column by which column word center falls within $pos=$current_position+$space+1+strlen($word)/2; if ($pos>$layout[$page]['cols']['6-7']) $col=7; elseif ($pos>$layout[$page]['cols']['5-6']) $col=6; elseif ($pos>$layout[$page]['cols']['4-5']) $col=5; elseif ($pos>$layout[$page]['cols']['3-4']) $col=4; elseif ($pos>$layout[$page]['cols']['2-3']) $col=3; elseif ($pos>$layout[$page]['cols']['1-2']) $col=2; else $col=1; # Append the word and its space to it's assigned column $current_position+=strlen($sw); // if (($last_col!=$col)&&(ereg("^[^a-z]",$word))) $sw="*\n"+trim($sw); if (($last_col!=$col)&&($row[$col])&&ereg("^ +[^ a-z]",$sw)) $sw=$enlf.trim($sw); else $sw=" ".trim($sw); $row[$col].=$sw;//."\{$pos}"; //$row[$col]=$colpos['23'].','.$colpos['34'].','.$colpos['45'].','.$colpos['56'].','.$colpos['67'].','.$colpos['78']; $last_col=$col; } } # 
_________________________________________________________________________________________________________________________________ # # RENDER AND CLEAR CURRENT ROW # If current line is empty, process current row content if ($kind=='empty') { # This primary-entry-number hasn't been marked as done, so render it now if (!array_key_exists($current_entry,$entries)) { # Post-process row (to get type from name col) if (preg_match("/buoy/i",$row[1])) $row[8]='Bouy'; elseif (preg_match("/light([^e]|$)/i",$row[1])) $row[8]='Light'; else $row[8]=''; $row[9]=$chart; $row[10]=$area; # Render if ($row[8]=='Light') { if ($html) { print ""; foreach ($row as $col) { //$col=trim($col); if ($col=='') print ''; else print ''; } print "\n"; } else print ereg_replace("\n","\\n",join($separator,$row))."\n"; } $extracted++; } # Clear row and prepare for new info foreach (split($enlf,$row[0]) as $i) $entries[$i]=1; # Mark all numbers of this entry as done //$entries[$current_entry]=true; # Mark just the primary number as done $current_entry=''; $row=array('','','','','','','',''); } } # _________________________________________________________________________________________________________________________________ # # Clean up and exit if ($html) print ""; //unlink($tmp); if ($html) {?>
No.NamePosition      CharacteristicHeightRangeStructureRemarksTypeChartArea
Debug Info:
Page $page"; //foreach ($layout[$page]['cols'] as $k=>$v) print "
$k
$v"; //print "
    
 '.ereg_replace("\n","
",htmlentities($col)).'
$pages pages found, $extracted entries extracted.

<?}?> </php>