Difference between revisions of "Import CSV data into a wiki"

From Organic Design wiki
m (bug fix)
(Start adding template fetching and parsing ability)
Line 1: Line 1:
 
#!/usr/bin/perl
 
#!/usr/bin/perl
#{{perl}}{{#security:*|sysop}}{{Category:Robots}}
+
#
 
# - Licenced under LGPL (http://www.gnu.org/copyleft/lesser.html)
 
# - Licenced under LGPL (http://www.gnu.org/copyleft/lesser.html)
 
# - Author: http://www.organicdesign.co.nz/nad
 
# - Author: http://www.organicdesign.co.nz/nad
 
# - Source: http://www.organicdesign.co.nz/scraper.pl
 
# - Source: http://www.organicdesign.co.nz/scraper.pl
 
# - Started: 2008-03-21
 
# - Started: 2008-03-21
 
+
 
require('wiki.pl');
 
require('wiki.pl');
 
+
 
# Job, log and error files
 
# Job, log and error files
 
$ARGV[0] or die "No job file specified!";
 
$ARGV[0] or die "No job file specified!";
 
$ARGV[0] =~ /^(.+?)(\..+?)?$/;
 
$ARGV[0] =~ /^(.+?)(\..+?)?$/;
 +
 +
# Set a debug conditional
 +
$::debug = 1;
 +
 
$::log = "$1.log";
 
$::log = "$1.log";
 
$::err = "$1.err";
 
$::err = "$1.err";
Line 16: Line 20:
 
$::title = 0;
 
$::title = 0;
 
$::template = 'Record';
 
$::template = 'Record';
 
+
 
# Parse the job file
 
# Parse the job file
 
if (open JOB,'<',$ARGV[0]) {
 
if (open JOB,'<',$ARGV[0]) {
Line 30: Line 34:
 
close JOB;
 
close JOB;
 
} else { die "Couldn't parse job file!" }
 
} else { die "Couldn't parse job file!" }
 
+
  
 
# Open CSV file and read in headings line
 
# Open CSV file and read in headings line
Line 38: Line 42:
 
@headings = split /$::sep/i, $1;
 
@headings = split /$::sep/i, $1;
 
} else { die "Could not open CSV file!" }
 
} else { die "Could not open CSV file!" }
 
+
 
# Log in to the wiki
 
# Log in to the wiki
 
wikiLogin($::wiki,$::user,$::pass) or exit;
 
wikiLogin($::wiki,$::user,$::pass) or exit;
 +
 +
# fetch the template if it exists
 +
$response = $client->get("$wiki?title=$template&action=raw");
 +
if( $response->is_success ) {
 +
  $wikitext = $response->content;
 +
 +
 +
  # Remove noinclude areas
 +
  $wikitext =~ s/<noinclude>.+?<\/noinclude>//gs;
 +
 +
  # Find all unique {{{parameters}}}
 +
  # http://en.wikipedia.org/wiki/Help:Templates#Parameters
 +
 +
  while ($wikitext =~ m/\{{{(.+?)}}}/g ) {
 +
    $key = $1;
 +
    # Remove default anchors if there are any
 +
    $key =~ s/\|.*//;
 +
    $params{$key} = undef;
 +
  }
 +
 +
  if($::debug) {
 +
    print "@headings";
 +
    print "\n\n\n|@{[%params]}|\n\n\n";
 +
    die "[\$::debug set exiting]\n" ;
 +
  }
 +
 +
  
 
# Get batch size and current number (also later account for n-bots)
 
# Get batch size and current number (also later account for n-bots)
 
+
 
# todo: log batch start
 
# todo: log batch start
 
+
 
# Process the records
 
# Process the records
 
$n = 1;
 
$n = 1;
Line 55: Line 86:
 
$tmpl  .= "}}";
 
$tmpl  .= "}}";
 
print "Processing record ".$n++."\n";
 
print "Processing record ".$n++."\n";
 
+
 
# Update the record
 
# Update the record
 
$text  = wikiRawPage($::wiki,$record[$::title],0);
 
$text  = wikiRawPage($::wiki,$record[$::title],0);
 
$text .= "\n$tmpl" unless $text =~ s/\{\{$template.+?\}\}/$tmpl/is;
 
$text .= "\n$tmpl" unless $text =~ s/\{\{$template.+?\}\}/$tmpl/is;
 
$done  = wikiPageEdit($::wiki,$record[$::title],$text,"$template updated by csv2wiki.pl");
 
$done  = wikiPageEdit($::wiki,$record[$::title],$text,"$template updated by csv2wiki.pl");
 
+
 
# log a row error if any
 
# log a row error if any
 
}
 
}
 
+
 
close CSV;
 
close CSV;

Revision as of 12:25, 28 May 2008

#!/usr/bin/perl
# - Licenced under LGPL (http://www.gnu.org/copyleft/lesser.html)
# - Author: http://www.organicdesign.co.nz/nad
# - Source: http://www.organicdesign.co.nz/scraper.pl
# - Started: 2008-03-21

# Import CSV data into a wiki.
#
# Usage: pass the path of a job file as the first argument. The job file
# holds "key: value" lines (optionally prefixed with "*") for csv, wiki,
# user, pass, separator (double-quoted), title and template. Each CSV row
# is turned into a {{template}} call and written to the wiki page named by
# the row's title column.
use strict;
use warnings;
# Several $:: globals are only assigned in this file.
# NOTE(review): $::log and $::err are presumably consumed by wiki.pl - confirm.
no warnings 'once';

# Provides wikiLogin, wikiRawPage and wikiPageEdit used below.
# NOTE(review): $::client (used for the raw template fetch) is presumably
# the HTTP user agent created by wiki.pl - confirm.
require('wiki.pl');

# Job, log and error files
$ARGV[0] or die "No job file specified!";
# Capture the job file name without its extension immediately, so that no
# later pattern match can clobber it before it is used.
( my ($base) = $ARGV[0] =~ /^(.+?)(\..+?)?$/ )
    or die "Invalid job file name!";

# Set a debug conditional.
# While this is set, the script exits right after dumping the CSV headings
# and template parameters, so no records are processed.
$::debug = 1;

# Defaults, any of which (apart from log/err) the job file may override
$::log      = "$base.log";
$::err      = "$base.err";
$::sep      = ',';
$::title    = 0;
$::template = 'Record';

# Parse the job file
my %setting_for = (
    csv      => \$::csv,
    wiki     => \$::wiki,
    user     => \$::user,
    pass     => \$::pass,
    title    => \$::title,
    template => \$::template,
);
open my $job, '<', $ARGV[0]
    or die "Couldn't parse job file! $ARGV[0]: $!";
while ( my $line = <$job> ) {

    # The separator is the only setting whose value must be double-quoted
    if ( $line =~ /^\*?\s*separator\s*:\s*"(.+?)"\s*$/i ) {
        $::sep = $1;
        next;
    }

    next unless $line =~ /^\*?\s*(\w+)\s*:\s*(.+?)\s*$/;
    my ( $key, $value ) = ( lc($1), $2 );
    ${ $setting_for{$key} } = $value if exists $setting_for{$key};
}
close $job;

# Open CSV file and read in headings line
defined $::csv or die "Could not open CSV file! (no csv setting in job file)";
open my $csv, '<', $::csv or die "Could not open CSV file! $::csv: $!";
my $heading_line = <$csv>;
( defined($heading_line) and $heading_line =~ /^\s*(.+?)\s*$/ )
    or die "CSV file has no headings line!";
my $trimmed_headings = $1;

# The separator is deliberately used as a regular expression (not quoted),
# so that job-file values such as "\t" keep working.
my @headings = split /$::sep/i, $trimmed_headings;

# Log in to the wiki
wikiLogin( $::wiki, $::user, $::pass ) or exit;

# fetch the template if it exists
my %params;
my $response = $::client->get("$::wiki?title=$::template&action=raw");
if ( $response->is_success ) {
    my $wikitext = $response->content;

    # Remove noinclude areas
    $wikitext =~ s/<noinclude>.+?<\/noinclude>//gs;

    # Find all unique {{{parameters}}}
    # http://en.wikipedia.org/wiki/Help:Templates#Parameters
    # Every brace is escaped: unescaped literal braces in a pattern are
    # deprecated or rejected by newer perls.
    while ( $wikitext =~ m/\{\{\{(.+?)\}\}\}/g ) {
        my $key = $1;

        # Remove default anchors if there are any
        $key =~ s/\|.*//;
        $params{$key} = undef;
    }

    if ($::debug) {
        print "@headings";

        # Same output as interpolating the flattened hash, but without
        # "uninitialized value" warnings for the undef values.
        my $dump = join ' ', map { defined $_ ? $_ : '' } %params;
        print "\n\n\n|$dump|\n\n\n";
        die "[\$::debug set exiting]\n";
    }
}

# Get batch size and current number (also later account for n-bots)
# todo: log batch start

# Process the records
my $n = 1;
while ( my $line = <$csv> ) {

    # Skip blank lines; otherwise a failed match would leave the previous
    # row's capture in $1 and that row would be processed a second time.
    next unless $line =~ /^\s*(.+?)\s*$/;
    my $trimmed = $1;
    my @record  = split /$::sep/, $trimmed;

    # Build the template call, one "|heading = value" line per column
    my $tmpl = '{{' . $::template . "\n";
    for my $i ( 0 .. $#headings ) {
        my $value = defined $record[$i] ? $record[$i] : '';
        $tmpl .= "|$headings[$i] = $value\n";
    }
    $tmpl .= "}}";
    print "Processing record " . $n++ . "\n";

    # Update the record: replace an existing call of the template in the
    # page, or append the call if the page has none.
    my $text = wikiRawPage( $::wiki, $record[$::title], 0 );
    $text = '' unless defined $text;
    $text .= "\n$tmpl"
        unless $text =~ s/\{\{\Q$::template\E.+?\}\}/$tmpl/is;
    my $done = wikiPageEdit( $::wiki, $record[$::title], $text,
        "$::template updated by csv2wiki.pl" );

    # todo: log a row error if any (using $done)
}

close $csv;