Difference between revisions of "Lighthouse Parser.php"
From Organic Design wiki
({{legacy}}) |
m |
||
(One intermediate revision by the same user not shown) | |||
Line 1: | Line 1: | ||
{{legacy}} | {{legacy}} | ||
− | <php | + | <source lang="php" |
# Send input file through xpdf.pdftotext into tmp file and read into $file | # Send input file through xpdf.pdftotext into tmp file and read into $file | ||
$root=$_SERVER['DOCUMENT_ROOT']; | $root=$_SERVER['DOCUMENT_ROOT']; | ||
Line 17: | Line 17: | ||
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style> | <style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style> | ||
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black> | <table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black> | ||
− | <tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position <td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?} | + | <tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position |
+ | <td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?} | ||
else {?>No. Name Position Characteristic Height Range Structure Remarks Type Chart Area | else {?>No. Name Position Characteristic Height Range Structure Remarks Type Chart Area | ||
<?} | <?} | ||
Line 215: | Line 216: | ||
</html> | </html> | ||
<?}?> | <?}?> | ||
− | </ | + | </source> |
Latest revision as of 17:25, 18 June 2017
<html>
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style>
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black>
<tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position
<td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?}
else {?>No. Name Position Characteristic Height Range Structure Remarks Type Chart Area
<?}
# Extraction issues:
# 1. Assuming at least one empty line between entries
# 2. Column spacing is different on each page
# 3. Entries can have more than one number (same entry with multiple keys)
# 4. Content can start on line before number (due to layout inaccuracy in xpdf parser)
# 5. Position column header does not match content well, but position content is consistent
# _________________________________________________________________________________________________________________________________
#
# PARSE#1 - BUILD LAYOUT ARRAY
# Initialise page loop environment
# PARSE#1 scans every line of $file once and records, per page, the character
# offsets at which one output column ends and the next begins. The result is
# $layout[page]['cols'][boundary], with boundary keys '1-2' .. '6-7', which the
# extraction loop later uses to assign each word to a column.
$enlf="\n"; # chr used to separate multiple entry numbers
$separator="\t"; # column separator used by the plain-text (non-HTML) output
$last_line='';
$last_kind='';
$kind=''; # defined up front so the first iteration never reads an unset variable
$layout=array();
$pages=0;
$positions_found=array(); # histogram: offset of Position content => number of lines seen at that offset
# Loop through lines, record column layout by page
foreach ($file as $line) {
	# Determine action based on line content
	if (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) $kind='page1';
	elseif (($last_kind=='page1')&&preg_match("/^ +No\\. +Name and Location +Position +Characteristic +Height +Range +Structure +Remarks/",$line)) {
		# Get most frequent location of Position content for cols 1-2 and 2-3 from page just done
		$max=0; $p=0;
		foreach ($positions_found as $k=>$v) if ($v>$max) {$max=$v; $p=$k;}
		$layout[$pages]['cols']['1-2']=$p;
		$layout[$pages]['cols']['2-3']=$p+12;
		$pages++;
		# Last four boundaries work well off header text position at start of this new page
		$layout[$pages]['cols']['3-4']=strpos($line,"Height")-1;
		$layout[$pages]['cols']['4-5']=strpos($line,"Range");
		$layout[$pages]['cols']['5-6']=strpos($line,"Range")+6;
		# 6-7 boundary is midway between the "7" and "8" found on the previous (column number) line
		$layout[$pages]['cols']['6-7']=(strpos($last_line,"7")+strpos($last_line,"8"))/2;
		# Prepare for next page
		$kind='page2'; # was "$kind=='page2';", a comparison with no effect
		$positions_found=array();
	}
	elseif (preg_match("/^(.+)[0-9][0-9] [0-9][0-9] [0-9][0-9] [NW]/",$line,$m)) {
		# Position data found in this entry, record its location
		# ($kind is deliberately left as it was for this kind of line)
		$offset=strlen($m[1]);
		$positions_found[$offset]=isset($positions_found[$offset])?$positions_found[$offset]+1:1;
	}
	else $kind='general';
	# Prepare for next line iteration
	$last_line=$line;
	$last_kind=$kind;
}
# The 1-2 and 2-3 boundaries of a page are only written when the NEXT page header
# is reached, so the final page would otherwise be left without them. Flush the
# positions collected for the last page here, using the same rule as in the loop.
if ($pages>0) {
	$max=0; $p=0;
	foreach ($positions_found as $k=>$v) if ($v>$max) {$max=$v; $p=$k;}
	$layout[$pages]['cols']['1-2']=$p;
	$layout[$pages]['cols']['2-3']=$p+12;
}
# _________________________________________________________________________________________________________________________________
#
# PARSE#2 - CONTENT EXTRACTION LOOP
# Initialise line-loop environment
# PARSE#2 walks $file a second time. Consecutive non-empty lines are accumulated
# into $row (one string per output column) using the per-page column boundaries
# in $layout; an empty line ends the entry, which is then rendered (HTML table
# row when $html is set, otherwise a $separator-joined text line) and cleared.
#
# The removed POSIX regex functions (ereg, ereg_replace, split) have been
# replaced by their preg_*/str_replace/explode equivalents so the script no
# longer fails with a fatal error on PHP 7 and later.
$row=array('','','','','','','','');
$page=0;
$entries=array(''=>'dont render null rows'); # entry numbers already handled; '' pre-marked so rows without a number are skipped
$extracted=0;
$current_entry='';
$last_kind='';
$chart=0;
$area='';
# Loop through lines, extract & render content
foreach ($file as $line) {
	# Determine the kind of line from content
	$line=rtrim($line);
	$last_kind=$kind;
	if (preg_match("/^([0-9]+) +/",$line,$entry_match)) $kind='entry'; # Numbered entry
	elseif (str_replace(" ","",$line)=="") $kind='empty'; # Empty
	elseif (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) {
		# New page
		$page++;
		//print "<tr bgcolor=#cc9999><td>Debug Info:<br>Page $page";
		//foreach ($layout[$page]['cols'] as $k=>$v) print "<td>$k<br>$v";
		//print "<td> <td> <td> <td> </tr>\n";
		$kind='page';
	}
	elseif (preg_match("/\\(Chart ([0-9]+)\\)/",$line,$m)&&($current_entry=='')) {
		# Chart
		$kind='chart';
		$chart=$m[1];
	}
	elseif (($last_kind=='empty')&&(!preg_match("/SECTION/",$line)&&!preg_match("/^[ 0-9]+$/D",$line)&&preg_match("/^ {30,80}([^ ].+)$/",$line,$m))) {
		# Area (type 1): an indented heading with no run of two or more spaces in it
		# NOTE(review): when the heading does contain a double space, $kind is left
		# unchanged (still 'empty' from the previous line), so this line's content is
		# not accumulated. That matches the original behaviour; confirm it is intended.
		if (!preg_match("/ {2,}/",$m[1])) {
			$kind='area';
			$area=$m[1];
		}
	}
	else $kind='general';
	# _________________________________________________________________________________________________________________________________
	#
	# ACCUMULATE COLUMN CONTENT FOR CURRENT MULTILINE ROW
	# If line has an entry number, append to row[0] ($current_entry is the primary entry num only)
	if ($kind=='entry') {
		if ($current_entry) $row[0].=$enlf.$entry_match[1];
		else $row[0]=$current_entry=$entry_match[1];
	}
	# If line is not empty, but also not a page header, then extract content into current row
	if (($kind=='general')||($kind=='entry')) {
		# Remove entry num if any because already dealt with separately
		$tmp=preg_replace("/^([0-9]+)/","",$line);
		# Loop through content a word at a time (incl. spaces before each word for position info)
		$current_position=strlen($line)-strlen($tmp); # Account for removed entry num
		preg_match_all("/( +[^ ]+)/",$tmp,$words);
		$last_col=-1;
		foreach ($words[1] as $sw) {
			# Get length of space and remove from word
			preg_match("/^( +)([^ ]+)$/",$sw,$m);
			$space=strlen($m[1]);
			$word=$m[2];
			# For each word assign to a column by which column word center falls within
			$pos=$current_position+$space+1+strlen($word)/2;
			if ($pos>$layout[$page]['cols']['6-7']) $col=7;
			elseif ($pos>$layout[$page]['cols']['5-6']) $col=6;
			elseif ($pos>$layout[$page]['cols']['4-5']) $col=5;
			elseif ($pos>$layout[$page]['cols']['3-4']) $col=4;
			elseif ($pos>$layout[$page]['cols']['2-3']) $col=3;
			elseif ($pos>$layout[$page]['cols']['1-2']) $col=2;
			else $col=1;
			# Append the word and its space to its assigned column
			$current_position+=strlen($sw);
			// if (($last_col!=$col)&&(ereg("^[^a-z]",$word))) $sw="*\n"+trim($sw);
			# A word that switches column, lands in a column that already has content and
			# does not start with a lowercase letter begins a new line within that column
			if (($last_col!=$col)&&($row[$col])&&preg_match("/^ +[^ a-z]/",$sw)) $sw=$enlf.trim($sw);
			else $sw=" ".trim($sw);
			$row[$col].=$sw;//."\{$pos}";
			//$row[$col]=$colpos['23'].','.$colpos['34'].','.$colpos['45'].','.$colpos['56'].','.$colpos['67'].','.$colpos['78'];
			$last_col=$col;
		}
	}
	# _________________________________________________________________________________________________________________________________
	#
	# RENDER AND CLEAR CURRENT ROW
	# If current line is empty, process current row content
	if ($kind=='empty') {
		# This primary-entry-number hasn't been marked as done, so render it now
		if (!array_key_exists($current_entry,$entries)) {
			# Post-process row (to get type from name col)
			if (preg_match("/buoy/i",$row[1])) $row[8]='Bouy';
			elseif (preg_match("/light([^e]|$)/i",$row[1])) $row[8]='Light';
			else $row[8]='';
			$row[9]=$chart;
			$row[10]=$area;
			# Render (only rows classified as 'Light' are output)
			if ($row[8]=='Light') {
				if ($html) {
					print "<tr valign=top>";
					foreach ($row as $col) {
						//$col=trim($col);
						if ($col=='') print '<td> </td>';
						else print '<td>'.str_replace("\n","<br>",htmlentities($col)).'</td>';
					}
					print "</tr>\n";
				} else print str_replace("\n","\\n",implode($separator,$row))."\n";
			}
			# Counted for every newly seen entry, whether or not it was rendered
			$extracted++;
		}
		# Clear row and prepare for new info
		foreach (explode($enlf,$row[0]) as $i) $entries[$i]=1; # Mark all numbers of this entry as done
		//$entries[$current_entry]=true; # Mark just the primary number as done
		$current_entry='';
		$row=array('','','','','','','','');
	}
}
# _________________________________________________________________________________________________________________________________
#
# Finish up: in HTML mode, append a summary row reporting the page and entry
# counts, then close the table and the document (nothing is emitted otherwise)
//unlink($tmp);
if ($html) {
	print "<tr><td colspan=11 align=center><font color=red>$pages pages found, $extracted entries extracted.</font></td></tr>";
?>
</table>
</html>
<?}?>