Difference between revisions of "Import CSV data into a wiki"

From Organic Design wiki
(Column selection from {{{params}}} in template now working)
(Is this newer, or older?)
Line 7: Line 7:
 
# - API:    http://en.wikipedia.org/w/api.php
 
# - API:    http://en.wikipedia.org/w/api.php
 
   
 
   
# Todo
 
# Make it so that if there is no title then it increments
 
# $hashref = { $wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g }
 
 
require('wiki.pl');
 
require('wiki.pl');
 
   
 
   
Line 17: Line 14:
  
 
# Set a debug conditional
 
# Set a debug conditional
$::debug = 1;
+
$::debug = 0;
  
 
$::log = "$1.log";
 
$::log = "$1.log";
Line 35: Line 32:
 
if (/^\*?\s*separator\s*:\s*"(.+?)"\s*$/i) { $::sep = $1 }
 
if (/^\*?\s*separator\s*:\s*"(.+?)"\s*$/i) { $::sep = $1 }
 
if (/^\*?\s*title\s*:\s*(.+?)\s*$/i)      { $::title = $1 }
 
if (/^\*?\s*title\s*:\s*(.+?)\s*$/i)      { $::title = $1 }
if (/^\*?\s*template\s*:\s*(.+?)\s*$/i)    { $::template = $1 }  
+
if (/^\*?\s*template\s*:\s*(.+?)\s*$/i)    { $::template = $1 }
if (/^\*?\s*prefix\s*:\s*(.+?)\s*$/i)     { $::prefix = $1 }
+
if (/^\*?\s*prefix\s*:\s*(.+?)\s*$/i)   { $::prefix = $1 }
 
}
 
}
 
close JOB;
 
close JOB;
Line 63: Line 60:
 
   # http://en.wikipedia.org/wiki/Help:Templates#Parameters
 
   # http://en.wikipedia.org/wiki/Help:Templates#Parameters
  
   while ($wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g ) {
+
   while ($wikitext =~ m/\{{{(.+?)}}}/g ) {
     $params{$1} = undef;
+
    $key = $1;
 +
    # Remove default anchors if there are any
 +
    $key =~ s/\|.*//;
 +
     $params{$key} = undef;
 
   }
 
   }
 
+
   # Create %{param=index} hash
+
   # Need to check @headings elements that are identical to keys(%param)
   foreach( $i = 0; $i <= $#headings; $i++ ) {
+
   for( $i = 0; $i < @headings; $i++ ) {
       if(exists($params{$headings[$i]})) {
+
        
       $params{$headings[$i]} = $i;
+
    if(exists($params{$headings[$i]})) {
      }
+
       print "$headings[$i] matches!\n";
 +
      push(@cols ,$i);
 +
    }
 
   }
 
   }
 
+
 
 
   if($::debug) {
 
   if($::debug) {
     print "\@headings: @headings\n";
+
     print "\@A: @headings\n";
     print "%params: @{[%params]}\n";
+
     print "%H: @{[%params]}\n";
 +
    print "\@cols @cols\n";
 +
    die  "[\$::debug set exiting]\n" ;
 
   }
 
   }
 
}
 
}
Line 90: Line 94:
 
@record = split /$::sep/, $1;
 
@record = split /$::sep/, $1;
 
$tmpl  = "{{$template\n";
 
$tmpl  = "{{$template\n";
$tmpl  .= "|$_ = $record[$params{$_}]\n" foreach (keys %params);
+
# JUST NEED TO FIX UP HERE
 +
        local $/="|";
 +
$tmpl  .= "|$headings[$_] = $record[$_]\n" for 0..$#headings;
 
$tmpl  .= "}}";
 
$tmpl  .= "}}";
 
print "Processing record ".$n++."\n";
 
print "Processing record ".$n++."\n";
if($::debug) {
+
    print "\$tmpl = $tmpl\n";
 
    die  "[\$::debug set exiting]\n" ;
 
}
 
 
# Update the record
 
# Update the record
 
$text  = wikiRawPage($::wiki,$record[$::title],0);
 
$text  = wikiRawPage($::wiki,$record[$::title],0);

Revision as of 23:19, 3 June 2008

#!/usr/bin/perl
# Our Perl scripts. {{#security:*|sysop}} Automated scripts to perform batch automation.
# - Licenced under LGPL (http://www.gnu.org/copyleft/lesser.html)
# - Author: http://www.organicdesign.co.nz/nad
# - Source: http://www.organicdesign.co.nz/scraper.pl
# - Started: 2008-03-21
# - API: http://en.wikipedia.org/w/api.php

require('wiki.pl');

# Job, log and error files
# The job file is the first command-line argument and is mandatory.
$ARGV[0] or die "No job file specified!";

# Strip the extension from the job file name; the log and error files share
# the remaining base name. The capture is taken straight away instead of
# reading $1 several statements later, where an intervening match could
# silently change it.
my ($job_base) = $ARGV[0] =~ /^(.+?)(\..+?)?$/;
$job_base = $ARGV[0] unless defined $job_base;

# Set a debug conditional
$::debug = 0;

# Log/error file names, plus defaults for the settings that the job file
# can override (separator, title column, template name, prefix).
$::log      = "$job_base.log";
$::err      = "$job_base.err";
$::sep      = ',';
$::title    = 0;
$::template = 'Record';
$::prefix   = "";

# Parse the job file

if (open my $job_fh, '<', $ARGV[0]) {

    # Settings of the form "name: value" (optionally written as a "*" list
    # item), mapped to the global each one fills in.
    my %setting_for = (
        csv      => \$::csv,
        wiki     => \$::wiki,
        user     => \$::user,
        pass     => \$::pass,
        title    => \$::title,
        template => \$::template,
        prefix   => \$::prefix,
    );

    # Read line by line rather than slurping the whole file into a list.
    while (my $line = <$job_fh>) {
        if ($line =~ /^\*?\s*(csv|wiki|user|pass|title|template|prefix)\s*:\s*(.+?)\s*$/i) {
            ${ $setting_for{lc $1} } = $2;
        }
        # The separator must be double-quoted in the job file; the quotes are
        # dropped and only the text between them is kept.
        elsif ($line =~ /^\*?\s*separator\s*:\s*"(.+?)"\s*$/i) {
            $::sep = $1;
        }
    }
    close $job_fh;
}
else { die "Couldn't parse job file!" }


# Open CSV file and read in headings line

if (open CSV,'<',$::csv) {

    # The first line of the CSV names the columns. The CSV handle is left
    # open so that the record loop further down can keep reading from it.
    my $heading_line = <CSV>;

    # Split only when the line really has content: for an empty file or a
    # blank first line the pattern does not match, and splitting $1 would
    # reuse a capture left over from some earlier match.
    # The separator is applied case-sensitively, the same way the record
    # loop splits each data row, so column positions line up.
    @headings = (defined $heading_line and $heading_line =~ /^\s*(.+?)\s*$/)
        ? split(/$::sep/, $1)
        : ();
}
else { die "Could not open CSV file!" }

# Log in to the wiki

# Stop if the login is refused. Exit with a non-zero status so that a calling
# shell or scheduler sees the failure (a bare exit reports success).
wikiLogin($::wiki,$::user,$::pass) or exit 1;

# Fetch the template if it exists

# Request the raw wikitext of Template:<template> and, if that succeeds, work
# out which CSV columns correspond to {{{parameters}}} used by the template.
# NOTE(review): $client is not created in this section - presumably wiki.pl
# provides it; confirm.
$response = $client->get("$wiki?title=Template:$template&action=raw");
if ($response->is_success) {

    $wikitext = $response->content;

    # Remove noinclude areas. Both the opening and the closing tag are
    # matched so that only the text between them is dropped; matching on the
    # closing tag alone would also discard everything in front of it.
    $wikitext =~ s/<noinclude>.*?<\/noinclude>//gs;

    # Find all unique {{{parameters}}}
    # http://en.wikipedia.org/wiki/Help:Templates#Parameters
    # Every brace is escaped: a bare "{" in a pattern is deprecated and is
    # rejected outright by newer perls.
    while ($wikitext =~ /\{\{\{(.+?)\}\}\}/g) {
        my $key = $1;
        # Remove default anchors if there are any ({{{name|default}}} -> name)
        $key =~ s/\|.*//;
        $params{$key} = undef;
    }

    # Record the index of every CSV heading that is also a template parameter
    for my $i (0 .. $#headings) {
        if (exists $params{$headings[$i]}) {
            print "$headings[$i] matches!\n";
            push @cols, $i;
        }
    }

    # In debug mode dump what was found and stop before any page is touched
    if ($::debug) {
        print "\@A: @headings\n";
        print "%H: @{[%params]}\n";
        print "\@cols @cols\n";
        die   "[\$::debug set exiting]\n";
    }

}

# Get batch size and current number (also later account for n-bots)
# todo: log batch start
# Process the records

# Turn every remaining CSV line into a {{template}} call and write it to the
# wiki page named by that record's title column.
$n = 1;
while (<CSV>) {

    # Trim the line; skip blank lines instead of splitting a stale $1
    next unless /^\s*(.+?)\s*$/;
    @record = split /$::sep/, $1;

    # Build the template call, one "|heading = value" line per CSV column.
    # TODO(review): every column is written here; @cols (the headings that
    # match a template parameter) is collected earlier but not used yet -
    # confirm whether only those columns should be emitted.
    # The input record separator ($/) is deliberately left alone: changing it
    # here would also change how any code called below reads its input.
    $tmpl  = "{{$template\n";
    $tmpl .= "|$headings[$_] = $record[$_]\n" for 0..$#headings;
    $tmpl .= "}}";
    print "Processing record ".$n++."\n";

    # Update the record: replace an existing call to the template in the page
    # text, or append the new call when the page has none. The template name
    # is quoted with \Q..\E so that it is matched literally.
    $text  = wikiRawPage($::wiki,$record[$::title],0);
    $text .= "\n$tmpl" unless $text =~ s/\{\{\Q$template\E.+?\}\}/$tmpl/is;
    $done  = wikiPageEdit($::wiki,$record[$::title],$text,"$template updated by csv2wiki.pl");

    # log a row error if any
}

close CSV;