#!/usr/bin/env ruby
# This script is called on the final sorted, de-spammed revision
# XML file.
#
# It doesn't currently check for no-op revisions... I believe
# that git-fast-load will dutifully load them even though nothing
# happened. I don't care to solve this by adding a file cache
# to this script. You can run iki-diff-next.rb to highlight any
# empty revisions that need to be removed.
#
# This turns each node into an equivalent file.
# It does not convert spaces to underscores in file names.
# This would break wikilinks.
# I suppose you could fix this with mod_speling or mod_rewrite.
#
# It replaces nodes in the Image: namespace with the files themselves.
require 'rubygems'
require './node-callback'
require 'time'
require 'ostruct'
# pipe is the stream to receive the git-fast-import commands
# putfrom is true if this branch has existing commits on it, false if not.
def format_git_commit(pipe, f)
# Need to escape backslashes and double-quotes for git?
# No, git breaks when I do this.
# For the filename "path with \\", git sez: bad default revision 'HEAD'
# filename = '"' + filename.gsub('\\', '\\\\\\\\').gsub('"', '\\"') + '"'
# In the calls below, length must be the size in bytes!!
# TODO: I haven't figured out how this works in the land of UTF8 and Ruby 1.9.
pipe.puts "commit #{f.branch}"
pipe.puts "committer #{f.username} <#{f.email}> #{f.timestamp.rfc2822}"
pipe.puts "data #{f.message.length}\n#{f.message}\n"
pipe.puts "from #{f.branch}^0" if f.putfrom
pipe.puts "M 644 inline #{f.filename}"
pipe.puts "data #{f.content.length}\n#{f.content}\n"
pipe.puts
end
# This just prints the revisions in a human-readable format.
def add_git_commit_dump(pipe, f)
pipe.puts "filename: #{fields.filename}"
pipe.puts "timestamp: #{fields.timestamp.rfc2822}"
pipe.puts "message: #{fields.message}"
pipe.puts "content size: #{fields.content.length}"
pipe.puts
end
def read_file(title)
puts "Reading file #{title}"
mystring = ''
File.open(title, "r") { |f|
mystring = f.read
}
return mystring
end
def imgurl(file)
# If you want to store all files in another dir, uncomment this line:
return "images/" + file
# By default we store all files in the root directory.
#return file
end
# Reads a Mediawiki commit, converts it into an Ikiwiki commit.
#
# An example of what we're parsing:
#
# Main Page
# 1
# 2006-06-15T16:37:31Z
#
# Mediawiki default
#
#
# '''Mediawiki has been successfully installed.'''
#
# Consult the [http://www.mediawiki.org/wiki/Help:Configuration_settings configuration settings list] and the [http://meta.wikipedia.org/wiki/MediaWiki_User%27s_Guide User's Guide] for information on customising and using the wiki software.
#
#
# Here are the fields that can appear in contributors:
# Mediawiki default
# 127.0.0.1
# Bronson2
def parse_revision(node, basedir)
elements = node.elements
title = elements["title"].text
elements = elements["revision"].elements
text = elements["text"].text
if not text
text = ""
end
timestamp = Time.parse(elements["timestamp"].text)
# we'll ignore Mediawiki's minor flag; it seems to be useless.
if true
# Enable this to show the node we're processing
puts "\n\n############"
puts title
puts timestamp
end
# ikiwiki uses two commit message formats:
# Known username? "web commit by USERNAME: COMMENT" (we drop user id)
# Unknown username? "web commit from IPADDR: COMMENT"
# Mediawiki Engine? "web commit from Mediawiki default: COMMENT"
# Of course, it also uses the free-form commit of anybody who
# whacked stuff into the git repo directly.
contributor = elements["contributor"]
message = "web commit "
if contributor.elements["ip"]
message += "by " + contributor.elements["ip"].text
elsif contributor.elements["username"]
message += "from " + contributor.elements["username"].text
else
raise "Could not discover author from " + contributor
end
# This appears to be a limitation of the ikiwiki commit message format
# (it uses a colon to denote the start of the commit message).
throw "User can't contain a colon: #{message}" if message =~ /:/
message += (elements["comment"] ? ": " + elements["comment"].text : "")
# Fix up the Image namespace.
if title =~ /^(Image|File|Media):(.*)$/
# Load the file directly into this commit rather than just referring to it.
title = imgurl($2)
text = read_file("#{basedir}/images/#{title.gsub(' ', '_')}")
else
# Fix the Image namespace
text.gsub!('[[File:', '[['+imgurl(""))
text.gsub!('[[Image:', '[['+imgurl(""))
text.gsub!('[[Media:', '[['+imgurl(""))
# And warn if we see any other weird namespaces in here.
text.scan(/\[\[([^:\]]*:[^\]]*)\]\]/) do |link|
$stderr.puts "Warning: unknown namespace in link \"#{link}\" on page \"#{title}\""
end
# And add the ".mediawiki" extension to all mediawiki-formatted files.
title = title + ".mediawiki"
end
# ikiwiki doesn't handle files with spaces in the names. :(
title.gsub!(" ", "_")
return OpenStruct.new({
:filename => title,
:message => message,
:content => text,
:timestamp => timestamp
})
end
throw "You must supply the name of the file to read!" unless infile = ARGV[0]
throw "You must supply the name of the repo to fill!" unless repo = ARGV[1]
# stupid libxml sax parser can't parse from a filehandle
# so we need to make all our paths absolute before chdiring.
infile = File.expand_path(infile)
basedir = File.expand_path('.')
Dir.chdir(repo) or throw "Could not chdir to #{repo}: #{$!}"
# Create a new git repo if one doesn't already exist
unless File.exists?('.git')
system("git init") or raise "Could not run git init"
end
#branch = "refs/head/master"
branch = `git symbolic-ref HEAD`.chomp
raise "Could not run git-symbolic-ref HEAD??" if branch.strip == ""
# git-fast-import requires us to use the from command if we're importing
# into a pre-existing branch. git-rev-parse will tell us if the branch exists.
putfrom = system("git rev-parse #{branch} >/dev/null 2>&1")
username = `git config user.name`.chomp
raise "You must set your user name. See git config." if username.strip == ""
email = `git config user.email`.chomp
raise "You must set your email address. See git config." if email.strip == ""
IO.popen("git fast-import --date-format=rfc2822 --quiet", "w") do |pipe|
nodeproc = proc { |node|
fields = parse_revision(node, basedir)
puts "importing #{fields.filename} at #{fields.timestamp}"
fields.putfrom = putfrom
# We only want putfrom to be true for the first commit we pass to gfi.
# If we leave it true then only the last commit will be stored in git.
putfrom = false
fields.branch = branch
fields.username = username
fields.email = email
format_git_commit(pipe, fields)
}
parse_node(infile, 'mediawiki/page', nodeproc,
{:compress_whitespace => %w{revision contributor}})
end
# git-fast-import doesn't update the working directory.
# This is good but we must manually update it when we're done.
system("git checkout")