Lighthouse Parser.php
From Organic Design wiki
<?
# Send input file through xpdf.pdftotext into tmp file and read into $file
# Convert the pdf named in the query-string (pdf=filename) to text with xpdf's
# pdftotext and load the resulting lines into $file.
#
# The filename is user-supplied, so it is never interpolated raw into the shell command.
$root=$_SERVER['DOCUMENT_ROOT'];
$pdf=isset($_GET['pdf'])?$_GET['pdf']:'';
if ($pdf==='') die("Couldn't read or parse pdf file - supply pdf=filename in query-string");

# Text output is written beside the pdf, with the trailing "pdf" swapped for "txt".
# (A tempnam() call used to precede this but its result was immediately overwritten,
# leaving a stray empty temp file behind on every request, so it has been dropped.)
$tmp=preg_replace('/pdf\z/','txt',$pdf);

# If the name does not end in "pdf" the replacement is a no-op; appending the extension
# instead keeps pdftotext from being told to overwrite its own input file.
if ($tmp===$pdf) $tmp=$pdf.'.txt';

# -layout keeps the physical column layout of the page in the text output
@system("$root\\..\\xpdf\\pdftotext -layout ".escapeshellarg($pdf)." ".escapeshellarg($tmp));

# file() yields one array element per line; fail with a usage hint if nothing was produced
@$file=file($tmp) or die("Couldn't read or parse pdf file - supply pdf=filename in query-string");
# Output table headings:
# - if 'html' is present in the query-string ($html is true), output is rendered as a table in an HTML doc,
# - otherwise output is a tab-separated list
# $html is true when an 'html' key is present in the query-string; its value is ignored.
# Read from the $_GET superglobal ($HTTP_GET_VARS is no longer populated by current PHP).
$html=array_key_exists('html',$_GET); if ($html) {?>
No. Name Position Characteristic Height Range Structure Remarks Type Chart Area $v) if ($v>$max) {$max=$v; $p=$k;} $layout[$pages]['cols']['1-2']=$p; $layout[$pages]['cols']['2-3']=$p+12; $pages++; # Last four boundaries work well off header text position at start of this new page $layout[$pages]['cols']['3-4']=strpos($line,"Height")-1; $layout[$pages]['cols']['4-5']=strpos($line,"Range"); $layout[$pages]['cols']['5-6']=strpos($line,"Range")+6; $layout[$pages]['cols']['6-7']=(strpos($last_line,"7")+strpos($last_line,"8"))/2; # Prepare for next page $kind=='page2'; $positions_found=array(); } elseif (preg_match("/^(.+)[0-9][0-9] [0-9][0-9] [0-9][0-9] [NW]/",$line,$m)) { # Position data found in this entry, record its location $positions_found[strlen($m[1])]++; } else $kind='general'; # Prepare for next line iteration $last_line=$line; $last_kind=$kind; } # _________________________________________________________________________________________________________________________________ # # PARSE#2 - CONTENT EXTRACTION LOOP # Initialise line-loop environment $row=array('','','','','','','',''); $page=0; $entries=array(''=>'dont render null rows'); $extracted=0; $current_entry=''; $last_kind=''; $chart=0; $area=''; # Loop through lines, extract & render content foreach ($file as $line) { # Determine the kind of line from content $line=rtrim($line); $last_kind=$kind; if (preg_match("/^([0-9]+) +/",$line,$entry_match)) $kind='entry'; # Numbered entry elseif (str_replace(" ","",$line)=="") $kind='empty'; # Empty elseif (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) { # New page $page++; //print "\n"; $kind='page'; } elseif (preg_match("/\\(Chart ([0-9]+)\\)/",$line,$m)&&($current_entry=='')) { # Chart $kind='chart'; $chart=$m[1]; } elseif (($last_kind=='empty')&&(!ereg("SECTION",$line)&&!ereg("^[ 0-9]+$",$line)&&preg_match("/^ {30,80}([^ ].+)$/",$line,$m))) { # Area (type 1) if (!ereg(" {2,}",$m[1])) { $kind='area'; 
$area=$m[1]; } } else $kind='general'; # _________________________________________________________________________________________________________________________________ # # ACCUMULATE COLUMN CONTENT FOR CURRENT MULTILINE ROW # If line has an entry number, append to row[0] ($current_entry is the primary entry num only) if ($kind=='entry') { if ($current_entry) $row[0].=$enlf.$entry_match[1]; else $row[0]=$current_entry=$entry_match[1]; } # If line is not empty, but also not a page header, then extract content into current row if (($kind=='general')||($kind=='entry')) { # Remove entry num if any because already dealt with separately $tmp=ereg_replace("^([0-9]+)","",$line); # Loop through content a word at a time (incl. spaces before each word for position info) $current_position=strlen($line)-strlen($tmp); # Account for removed entry num preg_match_all("/( +[^ ]+)/",$tmp,$words); $last_col=-1; foreach ($words[1] as $sw) { # Get length of space and remove from word preg_match("/^( +)([^ ]+)$/",$sw,$m); $space=strlen($m[1]); $word=$m[2]; # For each word assign to a column by which column word center falls within $pos=$current_position+$space+1+strlen($word)/2; if ($pos>$layout[$page]['cols']['6-7']) $col=7; elseif ($pos>$layout[$page]['cols']['5-6']) $col=6; elseif ($pos>$layout[$page]['cols']['4-5']) $col=5; elseif ($pos>$layout[$page]['cols']['3-4']) $col=4; elseif ($pos>$layout[$page]['cols']['2-3']) $col=3; elseif ($pos>$layout[$page]['cols']['1-2']) $col=2; else $col=1; # Append the word and its space to it's assigned column $current_position+=strlen($sw); // if (($last_col!=$col)&&(ereg("^[^a-z]",$word))) $sw="*\n"+trim($sw); if (($last_col!=$col)&&($row[$col])&&ereg("^ +[^ a-z]",$sw)) $sw=$enlf.trim($sw); else $sw=" ".trim($sw); $row[$col].=$sw;//."\{$pos}"; //$row[$col]=$colpos['23'].','.$colpos['34'].','.$colpos['45'].','.$colpos['56'].','.$colpos['67'].','.$colpos['78']; $last_col=$col; } } # 
_________________________________________________________________________________________________________________________________ # # RENDER AND CLEAR CURRENT ROW # If current line is empty, process current row content if ($kind=='empty') { # This primary-entry-number hasn't been marked as done, so render it now if (!array_key_exists($current_entry,$entries)) { # Post-process row (to get type from name col) if (preg_match("/buoy/i",$row[1])) $row[8]='Bouy'; elseif (preg_match("/light([^e]|$)/i",$row[1])) $row[8]='Light'; else $row[8]=''; $row[9]=$chart; $row[10]=$area; # Render if ($row[8]=='Light') { if ($html) { print ""; foreach ($row as $col) { //$col=trim($col); if ($col=='') print ''; else print ''; } print "\n"; } else print ereg_replace("\n","\\n",join($separator,$row))."\n"; } $extracted++; } # Clear row and prepare for new info foreach (split($enlf,$row[0]) as $i) $entries[$i]=1; # Mark all numbers of this entry as done //$entries[$current_entry]=true; # Mark just the primary number as done $current_entry=''; $row=array('','','','','','','',''); } } # _________________________________________________________________________________________________________________________________ # # Clean up and exit if ($html) print ""; //unlink($tmp); if ($html) {?>No. | Name | Position | Characteristic | Height | Range | Structure | Remarks | Type | Chart | Area |
Debug Info: Page $page"; //foreach ($layout[$page]['cols'] as $k=>$v) print " | $k $v"; //print " | |||||||||
'.ereg_replace("\n"," ",htmlentities($col)).' | ||||||||||
$pages pages found, $extracted entries extracted. |
<?}?>