#!/usr/bin/env node
/*
 * fahrplan_download.js - mirror a conference schedule ("Fahrplan") from
 * cccv.pentabarf.org into a local directory tree.
 *
 * Provenance: added as scripts/fahrplan_download.js (mode 100755) in commit
 * 98017c28c2cdd2656277c0e9d4d11742d9a11602 by Astro on Mon, 11 Oct 2010
 * 02:20:44 +0200, subject "a preliminary version of fahrplan_download.js".
 *
 * How it works: starting at /schedule/<conference_id>/index.de.html, every
 * *.html page is streamed through an expat parser. Each `href` and `src`
 * attribute found is (a) queued for download if it points at the same host
 * and (b) rewritten to its location below `baseDir`. All other resources are
 * written to disk unmodified.
 *
 * Usage: fahrplan_download.js <user> <pass> <conference_id>
 */

var URL = require('url');
var HTTP = require('http');
var FS = require('fs');
var EXPAT = require('node-expat');
var El = require('node-xmpp').Element;


if (process.argv.length !== 5) {
    // The original message was just 'Usage: ' (the argument list was
    // missing); the three arguments below are the ones read from argv.
    console.error('Usage: fahrplan_download.js <user> <pass> <conference_id>');
    process.exit(1);
}
var user = process.argv[2];
var pass = process.argv[3];
var conference_id = process.argv[4];
var outputDir = '.';
var baseDir = '/fahrplan';

// The single host this spider is allowed to talk to.
var HOST = 'cccv.pentabarf.org';
var BASE_URL = 'https://' + HOST + '/';
// Mode for created directories (rwxr-xr-x). Written via parseInt because a
// legacy octal literal (0755) is a syntax error in strict/module code.
var DIR_MODE = parseInt('755', 8);

// Small pool of clients; get() picks one at random for each request.
// NOTE(review): the third argument `true` presumably selects TLS in the
// HTTP.createClient API of the Node version this was written for - confirm.
var clients = [];
for (var i = 0; i < 8; i++)
    clients.push(HTTP.createClient(443, HOST, true));

/**
 * Issue an authenticated GET for `path` (with `?preview=1` appended).
 *
 * @param {string} path - absolute path on HOST, without query string
 * @param {function} cb - called with the response object on HTTP 200;
 *                        any other status is logged and `cb` is not called
 */
function get(path, cb) {
    var client = clients[Math.floor(Math.random() * clients.length)];

    console.log("GET " + path);
    var req = client.request('GET', path + '?preview=1',
                             { 'Host': HOST,
                               'Authorization': 'Basic ' + base64(user + ':' + pass) });

    req.on('response', function(res) {
        if (res.statusCode === 200)
            cb(res);
        else
            // Previously dropped silently, which made missing files
            // impossible to diagnose.
            console.error("GET " + path + " failed with status " + res.statusCode);
    });
    req.end();
}

/**
 * @param {string} s
 * @returns {string} base64 encoding of `s`
 */
function base64(s) {
    return new Buffer(s).toString('base64');
}

/**
 * Create every parent directory of the file `path` (like `mkdir -p` on the
 * dirname). The last path segment is taken to be the file name and is not
 * created.
 *
 * @param {string} path - '/'-separated file path, relative or absolute
 */
function ensureDirs(path) {
    var segments = path.split('/');
    segments.pop();     // drop the file name

    var prefix = '';
    segments.forEach(function(segment, idx) {
        if (idx > 0)
            prefix += '/';
        prefix += segment;
        // An absolute path yields an empty first segment; skip it but keep
        // the '/' added on the next round so the path stays absolute.
        if (prefix === '')
            return;
        try {
            FS.mkdirSync(prefix, DIR_MODE);
        } catch (e) {
            // An already existing directory is the expected case; report
            // anything else but keep going (best effort, as before).
            if (e.code !== 'EEXIST')
                console.error('mkdir ' + prefix + ': ' + e.message);
        }
    });
}

/**
 * Build a response handler that stores the response body unmodified.
 *
 * @param {string} path - local file to write
 * @returns {function} handler taking the response object
 */
function downloadTo(path) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('>> ' + path);
        res.on('data', function(data) {
            f.write(data);
        });
        res.on('end', function() {
            f.end();
        });
    };
}

/**
 * Build a response handler that parses the body as XML, passes every
 * attribute through `attrMapper` and writes the re-serialized document.
 *
 * @param {string} path - local file to write
 * @param {function} attrMapper - `function(name, value)` returning the
 *                                attribute value to store
 * @returns {function} handler taking the response object
 */
function downloadXml(path, attrMapper) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('<> ' + path);
        var p = new EXPAT.Parser();
        // Element currently being built; undefined outside the root element.
        var el;

        p.addListener('startElement', function(name, attrs) {
            var attrs2 = {};
            for (var k in attrs) {
                if (attrs.hasOwnProperty(k))
                    attrs2[k] = attrMapper(k, attrs[k]);
            }

            var child = new El(name, attrs2);
            if (el) {
                // Attach to the open element and descend into the child.
                el = el.cnode(child);
            } else
                el = child;
        });
        p.addListener('endElement', function(name) {
            if (!el.parent) {
                // Root element closed: serialize the finished tree.
                f.write(el.toString());
                el = undefined;
            } else
                el = el.parent;
        });
        p.addListener('text', function(s) {
            // Text outside the root element is dropped.
            if (el)
                el.t(s);
        });
        p.addListener('xmlDecl', function(version, encoding, standalone) {
            // NOTE(review): this writes an empty string, i.e. nothing. The
            // literal may originally have held an XML declaration - confirm
            // against the upstream script before changing it.
            f.write('');
        });

        res.on('data', function(data) {
            p.parse(data);
        });
        res.on('end', function() {
            // Signal end of input to the parser before closing the file.
            p.parse('', true);
            f.end();
        });
    };
}

/**
 * Crawler state: remembers which paths were already requested and kicks
 * off the crawl at the German index page of the conference.
 */
function Spider() {
    this.seen = {};
    this.see('/schedule/' + conference_id + '/index.de.html');
}

/**
 * Resolve a link to an absolute path on HOST.
 *
 * @param {string} path - link as found in a document (absolute or relative)
 * @param {string} [from] - absolute URL of the referring document; defaults
 *                          to the site root
 * @returns {?string} the path (query and fragment removed), or null when the
 *                    link points at another host or has no host at all
 */
var normalizePath = function(path, from) {
    // Resolve against the referring page so relative links work. The
    // original accepted `from` but always resolved against the site root.
    var uri = URL.parse(URL.resolve(from || BASE_URL, path));
    if (uri.hostname !== HOST)
        return null;
    return uri.pathname;
};

/**
 * Map a server path to its location in the mirror: the
 * `/schedule/<digits>/` prefix is replaced by `baseDir`, any other path is
 * placed below `baseDir` unchanged.
 *
 * @param {string} path - absolute server path
 * @returns {string}
 */
var manglePath = function(path) {
    var m;
    if ((m = path.match(/^\/schedule\/\d+\/(.+)$/)))
        return baseDir + '/' + m[1];
    else
        return baseDir + path;
};

/**
 * Download `path` unless it is off-site or was requested before. HTML
 * pages are additionally scanned for further links, which are rewritten to
 * their mirrored location.
 *
 * @param {string} path - link to fetch
 * @param {string} [from] - absolute URL of the referring document
 */
Spider.prototype.see = function(path, from) {
    path = normalizePath(path, from);
    if (!path)
        return;

    if (this.seen.hasOwnProperty(path))
        return;
    this.seen[path] = true;

    var outPath = outputDir + manglePath(path);
    if (!/\.html$/.test(path)) {
        get(path, downloadTo(outPath));
        return;
    }

    var that = this;
    // Absolute URL of this page; links inside it are relative to it.
    var pageUrl = URL.resolve(BASE_URL, path);
    var rewriteAttr = function(k, v) {
        if (k !== 'href' && k !== 'src')
            return v;

        var target = normalizePath(v, pageUrl);
        if (!target)
            return v;   // off-site or unparseable: leave the link alone

        that.see(target);
        // Keep in-page anchors working in the mirrored copy.
        return manglePath(target) + (URL.parse(v).hash || '');
    };
    get(path, downloadXml(outPath, rewriteAttr));
};

new Spider();