Difference between revisions of "Lighthouse Parser.php"

From Organic Design wiki
 
m
 
(2 intermediate revisions by the same user not shown)
Line 1: Line 1:
<?
+
{{legacy}}
 +
<source lang="php"
 
# Send input file through xpdf.pdftotext into tmp file and read into $file
 
# Send input file through xpdf.pdftotext into tmp file and read into $file
 
$root=$_SERVER['DOCUMENT_ROOT'];
 
$root=$_SERVER['DOCUMENT_ROOT'];
Line 16: Line 17:
 
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style>
 
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style>
 
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black>
 
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black>
<tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?}
+
<tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
 +
<td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?}
 
else {?>No. Name Position Characteristic Height Range Structure Remarks Type Chart Area
 
else {?>No. Name Position Characteristic Height Range Structure Remarks Type Chart Area
 
<?}
 
<?}
Line 214: Line 216:
 
</html>
 
</html>
 
<?}?>
 
<?}?>
 +
</source>

Latest revision as of 17:25, 18 June 2017

Legacy: This article describes a concept that has been superseded in the course of ongoing development on the Organic Design wiki. Please do not develop this any further or base work on this concept; it is kept only as a historical record of work done. You may find a link to the currently used concept or function in this article; if not, you can contact the author to find out what has taken the place of this legacy item.
<html>
<style><!--td{font-family:arial,sans-serif;font-size:11px;font-weight:bold}//--></style>
<table cellspacing=0 cellpadding=2 border=1 borderwidth=1 bordercolor=black>
<tr bgcolor=#9999cc><td>No.<td width=200>Name<td>Position&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<td width=120>Characteristic<td>Height<td>Range<td width=200>Structure<td>Remarks<td>Type<td>Chart<td>Area</tr><?}
else {?>No.	Name	Position	Characteristic	Height	Range	Structure	Remarks	Type	Chart	Area
<?}

	# Extraction issues:
	# 	1. Assuming at least one empty line between entries
	#	2. Column spacing is different on each page
	#	3. Entries can have more than one number (same entry with multiple keys)
	#	4. Content can start on line before number (due to layout inaccuracy in xpdf parser)
	#	5. Position column header does not match content well, but position content is consistent

# _________________________________________________________________________________________________________________________________
#
# PARSE#1 - BUILD LAYOUT ARRAY

# Reset the state used while detecting the per-page column layout
$layout=$positions_found=array();	# $layout: column boundaries per page; $positions_found: tally of position-data start offsets
$pages=0;							# number of page headers recognised so far
$last_line=$last_kind='';			# previous line and its classification
$separator="\t";					# column separator for the plain-text output
$enlf="\n";							# chr used to separate multiple entry numbers

# Loop through lines, record column layout by page.
# For every page N, $layout[N]['cols'] receives six boundaries:
#   '3-4','4-5','5-6'  taken from the header words at the top of page N
#   '6-7'              midway between the first "7" and first "8" on the line above that header
#   '1-2','2-3'        taken from the most frequent start offset of position data on page N
#                      (only known once the page is over; '2-3' is placed 12 chars after '1-2')
$kind='';	# defined up front so $last_kind never copies an unset variable
foreach ($file as $line) {

	# Determine action based on line content
	if (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) $kind='page1';
	elseif (($last_kind=='page1')&&preg_match("/^ +No\\. +Name and Location +Position +Characteristic +Height +Range +Structure +Remarks/",$line)) {

		# Get most frequent location of Position content for cols 1-2 and 2-3 from page just done
		# (first offset reaching the highest count wins; 0 when nothing was tallied)
		$p=$positions_found?array_search(max($positions_found),$positions_found):0;
		$layout[$pages]['cols']['1-2']=$p;
		$layout[$pages]['cols']['2-3']=$p+12;

		$pages++;

		# Last four boundaries work well off header text position at start of this new page
		$layout[$pages]['cols']['3-4']=strpos($line,"Height")-1;
		$layout[$pages]['cols']['4-5']=strpos($line,"Range");
		$layout[$pages]['cols']['5-6']=strpos($line,"Range")+6;
		$layout[$pages]['cols']['6-7']=(strpos($last_line,"7")+strpos($last_line,"8"))/2;

		# Prepare for next page (was "$kind=='page2'", a comparison with no effect)
		$kind='page2';
		$positions_found=array();
	}
	elseif (preg_match("/^(.+)[0-9][0-9] [0-9][0-9] [0-9][0-9] [NW]/",$line,$m)) {
		# Position data found in this entry, record its location.
		# The counter is created on first use instead of incrementing an undefined index.
		$offset=strlen($m[1]);
		if (isset($positions_found[$offset])) $positions_found[$offset]++;
		else $positions_found[$offset]=1;
		$kind='position';
	}
	else $kind='general';

	# Prepare for next line iteration
	$last_line=$line;
	$last_kind=$kind;
}

# The final page has no following header to trigger the tally above, so without this
# step its '1-2' and '2-3' boundaries would never be recorded.
if ($pages>0) {
	$p=$positions_found?array_search(max($positions_found),$positions_found):0;
	$layout[$pages]['cols']['1-2']=$p;
	$layout[$pages]['cols']['2-3']=$p+12;
}

# _________________________________________________________________________________________________________________________________
#
# PARSE#2 - CONTENT EXTRACTION LOOP

# Reset the state used by the extraction loop
$page=$extracted=$chart=0;						# current page, rows counted so far, chart number last seen
$current_entry=$area=$last_kind='';				# primary entry number of the row being built, area last seen, previous line kind
$row=array_fill(0,8,'');						# eight empty columns (indexes 0-7) for the row being accumulated
$entries=array(''=>'dont render null rows');	# entry numbers already handled; the '' key stops rows without a number being rendered

# Loop through lines, extract & render content.
# Each line is classified, its words are distributed over the columns of the row being
# built (using the boundaries in $layout for the current page), and the row is rendered
# and cleared whenever an empty line is reached.
# The removed POSIX functions (ereg, ereg_replace, split) are replaced by their
# PCRE/string equivalents so the loop runs on PHP 7 and later.
foreach ($file as $line) {

	# Determine the kind of line from content.
	# $last_kind is the previous line's kind (on the first pass, whatever $kind held before this loop).
	$line=rtrim($line);
	$last_kind=$kind;

	if (preg_match("/^([0-9]+) +/",$line,$entry_match)) $kind='entry';	# Numbered entry
	elseif (str_replace(" ","",$line)=="") $kind='empty';				# Empty
	elseif (preg_match("/^ +\\(1\\) +\\(2\\) +\\(3\\) +\\(4\\) +\\(5\\) +\\(6\\) +\\(7\\) +\\(8\\)/",$line)) {
		# New page
		$page++;
		$kind='page';
	}
	elseif (preg_match("/\\(Chart ([0-9]+)\\)/",$line,$m)&&($current_entry=='')) {
		# Chart
		$kind='chart';
		$chart=$m[1];
	}
	elseif (($last_kind=='empty')&&(strpos($line,"SECTION")===false)&&!preg_match("/^[ 0-9]+$/",$line)&&preg_match("/^ {30,80}([^ ].+)$/",$line,$m)) {
		if (strpos($m[1],"  ")===false) {
			# Area (type 1): a single run of text indented by 30-80 spaces
			$kind='area';
			$area=$m[1];
		}
		# Text with internal gaps is ordinary column content. Previously $kind was left
		# untouched here, i.e. still 'empty', so the line was dropped and the row flushed.
		else $kind='general';
	}
	else $kind='general';


	# _________________________________________________________________________________________________________________________________
	#
	# ACCUMULATE COLUMN CONTENT FOR CURRENT MULTILINE ROW

	# If line has an entry number, append to row[0] ($current_entry is the primary entry num only)
	# Compared against '' rather than by truthiness so that an entry numbered "0" still counts.
	if ($kind=='entry') {
		if ($current_entry!=='') $row[0].=$enlf.$entry_match[1];
		else $row[0]=$current_entry=$entry_match[1];
	}

	# Column boundaries are only usable when all six were recorded for this page.
	# Without them every comparison below would read an undefined index; the words of such
	# lines never reached the name column, so skipping them does not change what is rendered.
	$have_layout=isset(
		$layout[$page]['cols']['1-2'],$layout[$page]['cols']['2-3'],$layout[$page]['cols']['3-4'],
		$layout[$page]['cols']['4-5'],$layout[$page]['cols']['5-6'],$layout[$page]['cols']['6-7']
	);

	# If line is not empty, but also not a page header, then extract content into current row
	if ($have_layout&&(($kind=='general')||($kind=='entry'))) {
		# Remove entry num if any because already dealt with separately.
		# (Stored in $rest, not $tmp, so any earlier value of $tmp is left alone.)
		$rest=preg_replace("/^[0-9]+/","",$line);
		# Loop through content a word at a time (incl. spaces before each word for position info)
		# NOTE(review): a first word with no leading space is not matched by the pattern below and
		# its width is not added to $current_position - confirm such lines do not occur in the input.
		$current_position=strlen($line)-strlen($rest); # Account for removed entry num
		preg_match_all("/( +[^ ]+)/",$rest,$words);
		$last_col=-1;
		foreach ($words[1] as $sw) {
			# Get length of space and remove from word
			preg_match("/^( +)([^ ]+)$/",$sw,$m);
			$space=strlen($m[1]);
			$word=$m[2];
			# For each word assign to a column by which column word center falls within
			$pos=$current_position+$space+1+strlen($word)/2;
			if ($pos>$layout[$page]['cols']['6-7']) $col=7;
			elseif ($pos>$layout[$page]['cols']['5-6']) $col=6;
			elseif ($pos>$layout[$page]['cols']['4-5']) $col=5;
			elseif ($pos>$layout[$page]['cols']['3-4']) $col=4;
			elseif ($pos>$layout[$page]['cols']['2-3']) $col=3;
			elseif ($pos>$layout[$page]['cols']['1-2']) $col=2;
			else $col=1;
			# Append the word and its space to its assigned column. A word that moves to a different,
			# already non-empty column and does not start with a lowercase letter begins a new line there.
			$current_position+=strlen($sw);
			if (($last_col!=$col)&&($row[$col])&&preg_match("/^ +[^ a-z]/",$sw)) $sw=$enlf.trim($sw);
			else $sw=" ".trim($sw);
			$row[$col].=$sw;
			$last_col=$col;
		}
	}

	# _________________________________________________________________________________________________________________________________
	#
	# RENDER AND CLEAR CURRENT ROW

	# If current line is empty, process current row content
	if ($kind=='empty') {

		# This primary-entry-number hasn't been marked as done, so render it now
		if (!array_key_exists($current_entry,$entries)) {
			# Post-process row (to get type from name col); spelling of the first value kept as in the original
			if (preg_match("/buoy/i",$row[1])) $row[8]='Bouy';
			elseif (preg_match("/light([^e]|$)/i",$row[1])) $row[8]='Light';
			else $row[8]='';
			$row[9]=$chart;
			$row[10]=$area;
			# Render (only rows typed as 'Light' are output; every processed row is still counted)
			if ($row[8]=='Light') {
				if ($html) {
					print "<tr valign=top>";
					foreach ($row as $cell) {
						# The integer 0 (no chart seen yet) is treated as blank explicitly, because
						# 0=='' is true before PHP 8 but false from PHP 8 on.
						if ($cell===''||$cell===0) print '<td>&nbsp;</td>';
						else print '<td>'.str_replace("\n","<br>",htmlentities($cell)).'</td>';
					}
					print "</tr>\n";
				}
				else print str_replace("\n","\\n",join($separator,$row))."\n";
			}
			$extracted++;
		}

		# Clear row and prepare for new info
		foreach (explode($enlf,$row[0]) as $i) $entries[$i]=1; # Mark all numbers of this entry as done
		$current_entry='';
		$row=array('','','','','','','','');
	}
}

# _________________________________________________________________________________________________________________________________
#

# Finish the output: in HTML mode print a summary row and close the table and document.
# Nothing is printed in plain-text mode.
if ($html) {
	print "<tr><td colspan=11 align=center><font color=red>$pages pages found, $extracted entries extracted.</font></td></tr>";
	print "</table>\n</html>\n";
}
# The unlink($tmp) call that used to sit here was already disabled; no file is removed.
?>