#!/usr/bin/env node
/*
 * Mirrors the schedule export of one conference from cccv.pentabarf.org.
 *
 * Starting at /schedule/<conference_id>/index.de.html, every *.html page is
 * fetched through curl (HTTP basic auth), re-serialised while its href/src
 * attributes are rewritten to local paths, and every target those attributes
 * point at on the same host is fetched in turn. Other files are stored as-is.
 * Everything is written below outputDir + baseDir.
 */
var URL = require('url');
// NOTE(review): bound to the plain 'http' module despite its name, and not
// referenced anywhere in this file -- all requests go through curl.
var HTTPS = require('http');
var FS = require('fs');
var EXPAT = require('node-expat');
var El = require('node-xmpp').Element;
var async = require('async');
var spawn = require('child_process').spawn;

if (process.argv.length !== 5) {
    console.error('Usage: ' + process.argv[1] + ' USER PASS CONFERENCE_ID');
    process.exit(1);
}

var user = process.argv[2];
var pass = process.argv[3];
var conference_id = process.argv[4];
var outputDir = '.';
var baseDir = '/2012/fahrplan';

/* Host that is mirrored; links to any other host are never followed. */
var HOSTNAME = 'cccv.pentabarf.org';
var ORIGIN = 'https://' + HOSTNAME;
/* rwxr-xr-x, spelled without an octal literal so it parses in strict code */
var DIR_MODE = parseInt('755', 8);

/*
 * Download queue, at most 4 curl processes at a time.
 * A task is { path, resCb }: resCb receives curl's stdout stream, and the
 * queue slot is released when the curl process exits (the exit code is
 * handed to the queue callback as its first argument).
 *
 * NOTE(review): the credentials are passed on curl's command line and are
 * therefore visible in the process list while a download is running.
 */
var getQueue = async.queue(function(task, cb) {
    console.log("GET", task.path);
    var curl = spawn('curl', ["-u", user + ':' + pass, ORIGIN + task.path]);
    task.resCb(curl.stdout);
    curl.on('exit', cb);
}, 4);

/* Enqueue a download of `path`; `cb` is called with the response stream. */
function get(path, cb) {
    getQueue.push({ path: path, resCb: cb });
}

/* Base64-encode a string. */
function base64(s) {
    // Buffer.from() instead of the deprecated Buffer constructor, which
    // allocates uninitialised memory when handed a number.
    return Buffer.from(s).toString('base64');
}

/*
 * Create every parent directory of the file at `path`.
 * Directories that already exist are fine; any other mkdir failure is
 * reported on stderr instead of being silently dropped.
 */
function ensureDirs(path) {
    var segments = path.split('/');
    segments.pop(); // last segment is the file name
    var current = '';
    segments.forEach(function(dir, i) {
        if (i > 0)
            current += '/';
        current += dir;
        if (current === '')
            return; // leading '/' of an absolute path, nothing to create yet
        try {
            FS.mkdirSync(current, DIR_MODE);
        } catch (e) {
            if (e.code !== 'EEXIST')
                console.error('mkdir ' + current + ': ' + e.message);
        }
    });
}

/* Returns a response handler that stores the stream unchanged at `path`. */
function downloadTo(path) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('>> ' + path);
        // pipe() copies the data and ends the file when the response ends
        res.pipe(f);
    };
}

/* Elements that get an empty text child when they have no children. */
var NON_EMPTY_ELS = ["textarea", "ul", "ol"];

/*
 * Returns a response handler that parses the stream as XML and writes a
 * re-serialised copy to `path`.
 *
 * attrMapper(name, value) is called for every attribute of every element;
 * its return value is written in place of the original attribute value.
 */
function downloadXml(path, attrMapper) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('<> ' + path);

        var p = new EXPAT.Parser();
        var el; // element currently being built; undefined outside the root

        p.addListener('startElement', function(name, attrs) {
            var attrs2 = {};
            for (var k in attrs) {
                if (attrs.hasOwnProperty(k))
                    attrs2[k] = attrMapper(k, attrs[k]);
            }
            var child = new El(name, attrs2);
            if (el) {
                el = el.cnode(child);
            } else {
                el = child;
            }
        });

        p.addListener('endElement', function(name) {
            if (NON_EMPTY_ELS.indexOf(name) >= 0 && el.children.length === 0) {
                // prevent some empty elements (presumably so they are not
                // serialised in self-closing form)
                el.t('');
            }
            if (!el.parent) {
                // root element closed: flush the whole tree
                f.write(el.toString());
                el = undefined;
            } else {
                el = el.parent;
            }
        });

        p.addListener('text', function(s) {
            if (el)
                el.t(s);
        });

        p.addListener('xmlDecl', function(version) {
            // Strings are written to the file as UTF-8, so that is the
            // encoding declared regardless of what the source document said.
            f.write('<?xml version="' + (version || '1.0') + '" encoding="UTF-8"?>\n');
        });

        res.on('data', function(data) {
            p.parse(data);
        });
        res.on('end', function() {
            p.parse('', true);
            f.end();
        });
    };
}

/* Crawler state: the set of paths already scheduled, seeded with the index. */
function Spider() {
    this.seen = {};
    this.see('/schedule/' + conference_id + '/index.de.html');
}

/*
 * link to download location
 *
 * Resolves `path` against `from` (default: the site root) and returns the
 * absolute path on the mirrored host, or null when the link leaves it.
 */
function normalizePath(path, from) {
    var root = ORIGIN + '/';
    // `from` may itself be a site-relative path, so anchor it at the root first
    var base = from ? URL.resolve(root, from) : root;
    var uri = URL.parse(URL.resolve(base, path));
    if (uri.hostname !== HOSTNAME)
        return null;
    return uri.pathname;
}

/*
 * link rewriting for xhtml files
 *
 * Maps a link as found in a page to the location it has in the mirror.
 */
function manglePath(path) {
    var m;
    if ((m = path.match(/^\/schedule\/\d+\/(.+)$/))) {
        return baseDir + '/' + m[1];
    } else if ((m = path.match(/^\/(xml|ical|xcal)\/(schedule|conference)\/\d+/))) {
        var ext;
        switch (m[1]) {
        case 'ical':
            ext = 'ics';
            break;
        case 'xcal':
            ext = 'xcs';
            break;
        default:
            ext = m[1];
        }
        return baseDir + '/schedule.' + ext;
    } else if ((m = path.match(/datenspuren\.de\/fahrplan\/(.*)$/))) {
        var f = m[1];
        f = f.replace(/day_(.+)\.html/, 'day/$1.html');
        return '/fahrplan/' + f;
    } else if (path.indexOf('://') >= 0) {
        // absolute URL elsewhere: keep untouched
        return path;
    } else {
        return baseDir + path;
    }
}

/*
 * Schedule `path` for download unless it was seen before or is off-site.
 * *.html pages are parsed and their links followed; anything else is a
 * plain download that is skipped when the output file already exists.
 */
Spider.prototype.see = function(path, from) {
    path = normalizePath(path, from);
    if (!path)
        return;

    var outPath = outputDir + manglePath(path);
    if (this.seen.hasOwnProperty(path))
        return;
    this.seen[path] = true;

    if (/\.html$/.test(path)) {
        var that = this;
        var rewriteAttr = function(k, v) {
            switch (k) {
            case 'href':
            case 'src':
                that.see(v);
                return manglePath(v);
            case 'action':
                // forms keep posting to the live site
                return (v.indexOf('://') >= 0) ? v : ORIGIN + v;
            default:
                return v;
            }
        };
        get(path, downloadXml(outPath, rewriteAttr));
    } else {
        // just binary download
        try {
            FS.statSync(outPath);
            console.log('.. ' + outPath);
        } catch (e) {
            // statSync() raised ENOENT?
            get(path, downloadTo(outPath));
        }
    }
};

new Spider();