#!/usr/bin/env node
/**
 * Command-line spider for cccv.pentabarf.org.
 *
 * Starting at /schedule/<conference_id>/index.de.html it downloads every
 * same-host resource referenced through `href`/`src` attributes, rewrites
 * those attributes to local paths below `baseDir`, and stores the results
 * below `outputDir`.
 */
var URL = require('url');
var HTTP = require('http');
var FS = require('fs');
var EXPAT = require('node-expat');
var El = require('node-xmpp').Element;

if (process.argv.length !== 5) {
    // Name the three positional arguments that are read just below.
    console.error('Usage: ' + process.argv[1] + ' <user> <pass> <conference_id>');
    process.exit(1);
}
var user = process.argv[2];
var pass = process.argv[3];
var conference_id = process.argv[4];

var outputDir = '.';
var baseDir = '/fahrplan';

// Permission bits for created directories (octal 755), spelled without the
// legacy `0755` literal, which is a syntax error in strict mode.
var DIR_MODE = parseInt('755', 8);

// Small pool of HTTPS clients; each request picks one at random.
var clients = [];
for (var i = 0; i < 8; i++)
    clients.push(HTTP.createClient(443, 'cccv.pentabarf.org', true));

/**
 * Issues an authenticated GET for `path` (with `?preview=1` appended).
 *
 * @param {string} path - Absolute path on cccv.pentabarf.org.
 * @param {function} cb - Called with the response object when the status
 *     code is 200. Any other status is only logged as a warning.
 */
function get(path, cb) {
    var client = clients[Math.floor(Math.random() * clients.length)];
    console.log("GET " + path);
    var req = client.request('GET', path + '?preview=1', {
        'Host': 'cccv.pentabarf.org',
        'Authorization': 'Basic ' + base64(user + ':' + pass)
    });
    req.end();
    req.on('response', function(res) {
        if (res.statusCode === 200)
            cb(res);
        else
            console.warn(res.statusCode + ': ' + path);
    });
}

/**
 * @param {string} s - Text to encode.
 * @returns {string} Base64 encoding of `s`.
 */
function base64(s) {
    return new Buffer(s).toString('base64');
}

/**
 * Creates every parent directory of `path`, one segment at a time. The last
 * path segment is treated as the file name and is not created.
 *
 * @param {string} path - '/'-separated file path.
 */
function ensureDirs(path) {
    var paths = path.split('/');
    paths.pop();
    var path1 = '';
    paths.forEach(function(dir) {
        if (path1 !== '')
            path1 += '/';
        path1 += dir;
        try {
            FS.mkdirSync(path1, DIR_MODE);
        } catch (e) {
            // An existing directory is the expected case; anything else is
            // reported instead of being silently dropped.
            if (e.code !== 'EEXIST')
                console.warn('mkdir ' + path1 + ': ' + e.message);
        }
    });
}

/**
 * Builds a response handler that streams the response body into `path`
 * unchanged.
 *
 * @param {string} path - Destination file path.
 * @returns {function} Handler taking the response object.
 */
function downloadTo(path) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('>> ' + path);
        res.on('data', function(data) {
            f.write(data, 'binary');
        });
        res.on('end', function() {
            f.end();
        });
    };
}

/**
 * Builds a response handler that parses the response body as XML, passes
 * every attribute through `attrMapper`, and writes the re-serialized
 * document to `path`.
 *
 * @param {string} path - Destination file path.
 * @param {function} attrMapper - Called as attrMapper(name, value); its
 *     return value replaces the attribute value.
 * @returns {function} Handler taking the response object.
 */
function downloadXml(path, attrMapper) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('<> ' + path);

        var p = new EXPAT.Parser();
        var el; // element currently being built; undefined outside the root
        p.addListener('startElement', function(name, attrs) {
            var attrs2 = {};
            for (var k in attrs) {
                if (attrs.hasOwnProperty(k))
                    attrs2[k] = attrMapper(k, attrs[k]);
            }
            var child = new El(name, attrs2);
            if (el) {
                el = el.cnode(child);
            } else
                el = child;
        });
        p.addListener('endElement', function(name) {
            if (!el.parent) {
                // Root element closed: serialize the whole tree.
                f.write(el.toString());
                el = undefined;
            } else
                el = el.parent;
        });
        p.addListener('text', function(s) {
            if (el)
                el.t(s);
        });
        p.addListener('xmlDecl', function(version, encoding, standalone) {
            // NOTE(review): the string literal originally written here was
            // unterminated (its content appears to have been lost), so the
            // declaration is reconstructed. UTF-8 is declared because the
            // write stream encodes strings as UTF-8 by default, whatever
            // encoding the source document declared - confirm against the
            // intended output.
            f.write('<?xml version="' + (version || '1.0') + '" encoding="UTF-8"?>\n');
        });
        res.on('data', function(data) {
            p.parse(data);
        });
        res.on('end', function() {
            p.parse('', true);
            f.end();
        });
    };
}

/**
 * Crawler state: remembers which paths were already requested and starts
 * the crawl at the conference's German index page.
 */
function Spider() {
    this.seen = {};
    this.see('/schedule/' + conference_id + '/index.de.html');
}

/**
 * Resolves `path` against `from` and reduces it to a path on the crawled
 * host.
 *
 * @param {string} path - Link target as found in a document.
 * @param {string} [from] - Base URL to resolve against; defaults to the
 *     site root.
 * @returns {?string} The pathname (query and fragment are dropped), or null
 *     when the link points at another host.
 */
var normalizePath = function(path, from) {
    from = from || 'https://cccv.pentabarf.org/';
    // Resolve against `from`; previously the parameter was defaulted but
    // the hard-coded site root was always used instead.
    var uri = URL.parse(URL.resolve(from, path));
    if (uri.hostname !== 'cccv.pentabarf.org')
        return null;
    return uri.pathname;
};

/**
 * Maps a server path to the local path under `baseDir`.
 *
 * - /schedule/<digits>/<rest>                      -> <baseDir>/<rest>
 * - /(xml|ical|xcal)/(schedule|conference)/<digits> -> <baseDir>/schedule.<ext>
 *   where ext is 'ics' for ical, 'xcs' for xcal and 'xml' for xml
 * - anything else                                  -> <baseDir><path>
 *
 * @param {string} path - Server path.
 * @returns {string} Local path, starting with `baseDir`.
 */
var manglePath = function(path) {
    var m;
    if ((m = path.match(/^\/schedule\/\d+\/(.+)$/)))
        return baseDir + '/' + m[1];
    else if ((m = path.match(/^\/(xml|ical|xcal)\/(schedule|conference)\/\d+/))) {
        var ext;
        switch (m[1]) {
        case 'ical':
            ext = 'ics';
            break;
        case 'xcal':
            ext = 'xcs';
            break;
        default:
            ext = m[1];
        }
        return baseDir + '/schedule.' + ext;
    } else
        return baseDir + path;
};

/**
 * Schedules `path` for download unless it is off-site or was already seen.
 * Paths ending in `.html` are parsed and their `href`/`src` attributes are
 * followed and rewritten; everything else is stored unchanged.
 *
 * @param {string} path - Link target.
 * @param {string} [from] - Base URL the link is relative to.
 */
Spider.prototype.see = function(path, from) {
    path = normalizePath(path, from);
    if (!path)
        return;
    var outPath = outputDir + manglePath(path);

    if (this.seen.hasOwnProperty(path))
        return;
    this.seen[path] = true;

    if (/\.html$/.test(path)) {
        var that = this;
        // Function expression rather than a declaration inside the `if`
        // block, whose scoping differs between sloppy and strict mode.
        var rewriteAttr = function(k, v) {
            switch (k) {
            case 'href':
            case 'src':
                that.see(v);
                // Off-site links are never downloaded, so they must keep
                // their original value instead of being prefixed with
                // baseDir.
                return normalizePath(v) ? manglePath(v) : v;
            default:
                return v;
            }
        };
        get(path, downloadXml(outPath, rewriteAttr));
    } else
        get(path, downloadTo(outPath));
};

new Spider();