#!/usr/bin/ruby
# Extract files from a webarchive file.
# Copyright (c) 2009 Takata Yoshiaki
#
# Usage: webarc.rb [input_file]
#
# Reads a webarchive property list (from input_file, or from standard input
# when no argument is given) and writes every embedded resource into the
# current directory, named after the last component of its WebResourceURL.
#
# See also:
#   plist(5) - property list format
#     (webarchive files are organized according to this format)

require "rexml/document"
require "nkf"

# Replace every %hh escape in +str+ with the byte it encodes.
# Returns a new string; +str+ itself is left untouched.
def decode_percent_escapes(str)
  str.gsub(/%([0-9A-Fa-f]{2})/) { $1.hex.chr }
end

# Derive the local file name for a resource from its WebResourceURL.
#
# The last path component of +url+ is percent-decoded twice (some archives
# carry doubly-encoded names) and converted to UTF-8 with NKF.
# File.basename is applied again afterwards because decoding can turn an
# escaped "%2F" into "/", which would otherwise let the name point outside
# the current directory.
def resource_file_name(url)
  name = File.basename(url)
  name = decode_percent_escapes(name)   # decode %hh
  name = decode_percent_escapes(name)   # doubly decode
  name = NKF.nkf("-w", name)
  File.basename(name)
end

infile = $stdin
unless ARGV.empty?
  fname = ARGV.shift
  # Best effort: turn a binary plist into XML so REXML can read it.
  # Note: without -o, plutil converts the file in place. The result is
  # deliberately not checked so that input which is already XML still works
  # where plutil is unavailable.
  system("plutil", "-convert", "xml1", fname)
  infile = File.open(fname)
end

begin
  # The whole document is parsed here, so the input can be closed right after.
  doc = REXML::Document.new(infile)
ensure
  infile.close unless infile.equal?($stdin)
end

doc.elements.each("//data") do |e|
  # Only <data> elements directly preceded by <key>WebResourceData</key>
  # hold resource contents.
  label = e.previous_element
  next unless label && label.text == "WebResourceData"

  # String#unpack returns an Array; take its single element so that the raw
  # decoded bytes are written rather than the array's string form.
  data = e.text.to_s.gsub(/\s/, '').unpack('m').first   # decode base64

  e.parent.elements.each("key") do |k|
    next unless k.text == "WebResourceURL"

    url_node = k.next_element
    url = url_node && url_node.text
    next if url.nil?

    name = resource_file_name(url)
    if name.empty? || name == "." || name == ".."
      warn "webarc: skipping resource with unusable name: #{url}"
      next
    end

    # Binary mode: resources may be images or other non-text data.
    File.open(name, "wb") { |fp| fp.write(data) }
  end
end