Difference between revisions of "Import CSV data into a wiki"

From Organic Design wiki
(remove protection)
(tidy up)
Line 39: Line 39:
 
if (/^\*?\s*prefix\s*:\s*(.+?)\s*$/i)      { $::prefix = $1 }
 
if (/^\*?\s*prefix\s*:\s*(.+?)\s*$/i)      { $::prefix = $1 }
 
if (/^\*?\s*append\s*:\s*(.+?)\s*$/i)      { $::append = $1 }
 
if (/^\*?\s*append\s*:\s*(.+?)\s*$/i)      { $::append = $1 }
}
+
}
 
close JOB;
 
close JOB;
 
} else { die "Couldn't parse job file!" }
 
} else { die "Couldn't parse job file!" }
Line 45: Line 45:
  
 
# Open CSV file and read in headings line
 
# Open CSV file and read in headings line
if (open CSV,'<',$::csv) {
+
if (open CSV, '<', $::csv) {
 
$_ = <CSV>;
 
$_ = <CSV>;
 
/^\s*(.+?)\s*$/;
 
/^\s*(.+?)\s*$/;
Line 56: Line 56:
 
# fetch the template if it exists
 
# fetch the template if it exists
 
$response = $client->get("$wiki?title=Template:$template&action=raw");
 
$response = $client->get("$wiki?title=Template:$template&action=raw");
if( $response->is_success ) {
+
if ($response->is_success) {
  $wikitext = $response->content;
+
$wikitext = $response->content;
  
  # Remove noinclude areas
+
# Remove noinclude areas
  $wikitext =~ s/<noinclude>.+?<\/noinclude>//gs;
+
$wikitext =~ s/<noinclude>.+?<\/noinclude>//gs;
  
  # Find all unique {{{parameters}}}
+
# Find all unique {{{parameters}}}
  # http://en.wikipedia.org/wiki/Help:Templates#Parameters
+
# http://en.wikipedia.org/wiki/Help:Templates#Parameters
  
  while ($wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g ) {
+
$params{$1} = undef while $wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g;
    $params{$1} = undef;
 
  }
 
  
  # Create %{param=index} hash
+
# Create %{param=index} hash
  foreach( $i = 0; $i <= $#headings; $i++ ) {
+
foreach ($i = 0; $i <= $#headings; $i++) {
      if(exists($params{$headings[$i]})) {
+
$params{$headings[$i]} = $i if exists $params{$headings[$i]};
      $params{$headings[$i]} = $i;
+
}
      }
 
  }
 
  
  if($::debug) {
+
if ($::debug) {
    print "\@headings: @headings\n";
+
print "\@headings: @headings\n";
    print "%params: @{[%params]}\n";
+
print "%params: @{[%params]}\n";
  }
+
}
 
}
 
}
 
    
 
    
Line 91: Line 87:
 
/^\s*(.+?)\s*$/;
 
/^\s*(.+?)\s*$/;
 
@record = split /$::sep/, $1;
 
@record = split /$::sep/, $1;
$tmpl   = "{{$template\n";
+
$tmpl = "{{$template\n";
$tmpl .= "|$_ = $record[$params{$_}]\n" foreach (keys %params);
+
$tmpl .= "|$_ = $record[$params{$_}]\n" foreach keys %params;
$tmpl .= "}}";
+
$tmpl .= "}}";
 
print "Processing record ".$n++."\n";
 
print "Processing record ".$n++."\n";
if($::debug) {
+
if ($::debug) {
 
    print "\$tmpl = $tmpl\n";
 
    print "\$tmpl = $tmpl\n";
 
    die  "[\$::debug set exiting]\n" ;
 
    die  "[\$::debug set exiting]\n" ;
 
}
 
}
 +
 
# Update the record
 
# Update the record
 
$text  = wikiRawPage($::wiki,$record[$::title],0);
 
$text  = wikiRawPage($::wiki,$record[$::title],0);
 
$text .= "\n$tmpl" unless $text =~ s/\{\{$template.+?\}\}/$tmpl/is;
 
$text .= "\n$tmpl" unless $text =~ s/\{\{$template.+?\}\}/$tmpl/is;
if($append) {
+
if ($append) {
$done = wikiPageAppend($::wiki,$::prefix . $record[$::title],$text,"[[Template:$::template|$::template]] appended using csv2wiki.pl");
+
$done = wikiPageAppend(
 +
$::wiki,
 +
$::prefix . $record[$::title],
 +
$text,
 +
"[[Template:$::template|$::template]] appended using csv2wiki.pl"
 +
);
 
} else {
 
} else {
$done = wikiPageEdit($::wiki,$::prefix . $record[$::title],$text,"[[Template:$::template|$::template]] replacement using csv2wiki.pl");
+
$done = wikiPageEdit(
 +
$::wiki,
 +
$::prefix . $record[$::title],
 +
$text,
 +
"[[Template:$::template|$::template]] replacement using csv2wiki.pl"
 +
);
 
  }
 
  }
 +
 
# log a row error if any
 
# log a row error if any
 
}
 
}
 
   
 
   
 
close CSV;
 
close CSV;

Revision as of 07:36, 4 September 2008

#!/usr/bin/perl
# Our Perl scripts. Automated scripts to perform batch automation.
# - Licenced under LGPL (http://www.gnu.org/copyleft/lesser.html)
# - Authors: Nad Sven
# - Source: http://www.organicdesign.co.nz/scraper.pl
# - Started: 2008-03-21
# - API: http://en.wikipedia.org/w/api.php

# Todo
# - Make it so that if there is no title then it increments
# - $hashref = { $wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g }

require('wiki.pl');

# Job, log and error files

# A job file must be given as the first command-line argument.
$ARGV[0] or die "No job file specified!";

# Base name for the log/error files: the job file path with the extension of
# its final path component removed. Only a dot that follows a non-separator
# character counts, so "../jobs/a.txt" gives "../jobs/a" (not "."), while
# "job.txt" and "my.job.txt" still give "job" and "my" as before.
(my $job_base = $ARGV[0]) =~ s{(?<=[^/\\])\.[^/\\]+$}{};

# Set a debug conditional
$::debug = 0;

# Defaults for every setting the job file may override.
$::log      = "$job_base.log";
$::err      = "$job_base.err";
$::sep      = ',';
$::title    = 0;
$::template = 'Record';
$::prefix   = "";
$::append   = 0;

# Parse the job file

# Read the job file named by $ARGV[0] and copy each recognised
# "key: value" line (optionally prefixed with "*") into its global setting.
# Keys are case-insensitive; unrecognised lines are ignored.
{
    # Setting name => the global it populates.
    my %setting_for = (
        csv       => \$::csv,
        wiki      => \$::wiki,
        user      => \$::user,
        pass      => \$::pass,
        separator => \$::sep,
        title     => \$::title,
        template  => \$::template,
        prefix    => \$::prefix,
        append    => \$::append,
    );
    my $known_keys = join '|', keys %setting_for;

    open my $job_fh, '<', $ARGV[0]
        or die "Couldn't open job file '$ARGV[0]': $!";

    # Read line by line rather than slurping the whole file into a list.
    while (my $line = <$job_fh>) {
        next unless $line =~ /^\*?\s*($known_keys)\s*:\s*(.+?)\s*$/i;
        my ($key, $value) = (lc $1, $2);

        # The separator is only accepted when wrapped in double quotes,
        # and the quotes themselves are not part of the value.
        if ($key eq 'separator') {
            next unless $value =~ /^"(.+)"$/;
            $value = $1;
        }

        ${ $setting_for{$key} } = $value;
    }

    close $job_fh;
}


# Open CSV file and read in headings line

# Open the CSV file on the CSV handle (left open for the record loop that
# follows) and split its first line into @headings using the job separator.
open CSV, '<', $::csv
    or die "Could not open CSV file '$::csv': $!";

{
    my $heading_line = <CSV>;

    # Stop if the file is empty or its first line is blank: splitting on a
    # failed match would silently reuse $1 from an earlier, unrelated match.
    (defined $heading_line and $heading_line =~ /^\s*(.+?)\s*$/)
        or die "CSV file '$::csv' has no headings line!";

    @headings = split /$::sep/i, $1;
}

# Log in to the wiki

# Stop the script (plain exit, no message of our own) when wikiLogin
# returns a false value for the configured wiki, user and password.
exit unless wikiLogin($::wiki, $::user, $::pass);

# fetch the template if it exists

# Request the raw wikitext of the template. When the request succeeds,
# collect the template's {{{parameters}}} into %params and map each one
# that is also a CSV heading to that heading's column index.
$response = $client->get("$wiki?title=Template:$template&action=raw");
if ($response->is_success) {
    $wikitext = $response->content;

    # Remove noinclude areas
    $wikitext =~ s/<noinclude>.+?<\/noinclude>//gs;

    # Find all unique {{{parameters}}}
    # http://en.wikipedia.org/wiki/Help:Templates#Parameters
    # Every parameter name becomes a key; the value stays undef until a
    # matching CSV heading is found below.
    $params{$1} = undef while $wikitext =~ /\{{3}(.+?)(\|.*?)?\}{3}/g;

    # Create %{param=index} hash
    for my $i (0 .. $#headings) {
        $params{$headings[$i]} = $i if exists $params{$headings[$i]};
    }

    if ($::debug) {
        print "\@headings: @headings\n";
        print "%params: @{[%params]}\n";
    }
}

# Get batch size and current number (also later account for n-bots)
# todo: log batch start
# Process the records

# Process the remaining CSV lines one record at a time: build a template
# call from the record's fields, then replace the existing call in the wiki
# page with it, or add it to the page text when no such call is present.
$n = 1;
while (<CSV>) {

    # Skip lines with no content; a failed match would otherwise leave a
    # stale $1 and the line would be processed with unrelated data.
    next unless /^\s*(.+?)\s*$/;
    @record = split /$::sep/, $1;

    # Build the template call. Only parameters that were mapped to a CSV
    # column are emitted: an unmapped parameter has an undef index, which
    # would otherwise be treated as index 0 and copy the first column.
    # Keys are sorted so the generated wikitext is stable between runs.
    $tmpl = "{{$template\n";
    for my $param (sort keys %params) {
        my $column = $params{$param};
        next unless defined $column;
        $tmpl .= "|$param = $record[$column]\n";
    }
    $tmpl .= "}}";

    print "Processing record ".$n++."\n";

    # In debug mode show the first generated template and stop.
    if ($::debug) {
        print "\$tmpl = $tmpl\n";
        die  "[\$::debug set exiting]\n" ;
    }

    # Update the record
    # NOTE(review): the page is read by its bare title but written with
    # $::prefix prepended - confirm this is intended when a prefix is set.
    $text  = wikiRawPage($::wiki, $record[$::title], 0);

    # \Q...\E keeps characters in the template name from acting as regex
    # metacharacters.
    $text .= "\n$tmpl" unless $text =~ s/\{\{\Q$template\E.+?\}\}/$tmpl/is;

    if ($append) {
        $done = wikiPageAppend(
            $::wiki,
            $::prefix . $record[$::title],
            $text,
            "[[Template:$::template|$::template]] appended using csv2wiki.pl"
        );
    }
    else {
        $done = wikiPageEdit(
            $::wiki,
            $::prefix . $record[$::title],
            $text,
            "[[Template:$::template|$::template]] replacement using csv2wiki.pl"
        );
    }

    # log a row error if any
}

# Close the CSV input handle once the record loop has finished.
close CSV;