a preliminary version of fahrplan_download.js

This commit is contained in:
Astro 2010-10-11 02:20:44 +02:00
parent dadcc9378b
commit 98017c28c2
1 changed file with 172 additions and 0 deletions

172
scripts/fahrplan_download.js Executable file
View File

@ -0,0 +1,172 @@
#!/usr/bin/env node
var URL = require('url');
var HTTP = require('http');
var FS = require('fs');
var EXPAT = require('node-expat');
var El = require('node-xmpp').Element;
// The script needs exactly three arguments after `node` and the script path.
if (process.argv.length !== 5) {
  console.error('Usage: <user> <pass> <conference_id>');
  process.exit(1);
}

// Credentials used for HTTP Basic auth, and the id of the conference to mirror.
var user = process.argv[2],
    pass = process.argv[3],
    conference_id = process.argv[4];

// Local output root and the directory below it that receives the mirrored files.
var outputDir = '.',
    baseDir = '/fahrplan';

// Pool of eight client connections to the pentabarf host on port 443.
var clients = [];
while (clients.length < 8) {
  clients.push(HTTP.createClient(443, 'cccv.pentabarf.org', true));
}
/**
 * Issue an authenticated GET request for `path` (with `?preview=1` appended)
 * on a randomly chosen client from the `clients` pool.
 *
 * @param {string} path - Absolute path on cccv.pentabarf.org.
 * @param {function} cb - Called with the response object, but only when the
 *   server answers with status 200. Other statuses are reported on stderr
 *   and `cb` is not invoked.
 */
function get(path, cb) {
  var client = clients[Math.floor(Math.random() * clients.length)];
  console.log("GET " + path);
  var req = client.request('GET', path + '?preview=1', {
    'Host': 'cccv.pentabarf.org',
    'Authorization': 'Basic ' + base64(user + ':' + pass)
  });
  req.end();
  req.on('response', function(res) {
    if (res.statusCode === 200) {
      cb(res);
    } else {
      // Make skipped downloads visible instead of dropping them silently.
      console.error('GET ' + path + ' failed with status ' + res.statusCode);
    }
  });
}
/**
 * Encode a string as base64.
 *
 * @param {string} s - Text to encode.
 * @returns {string} The base64 representation of `s`.
 */
function base64(s) {
  var bytes = new Buffer(s);
  return bytes.toString('base64');
}
/**
 * Create every directory leading up to the file named by `path`.
 * The final path segment is treated as the file name and is not created.
 *
 * @param {string} path - Slash-separated file path, relative or absolute.
 * @throws Rethrows the mkdir error when a directory could not be created and
 *   does not already exist (e.g. a regular file is in the way).
 */
function ensureDirs(path) {
  var segments = path.split('/');
  segments.pop(); // drop the file name, keep only the directories

  var isAbsolute = path.charAt(0) === '/';
  var dirMode = parseInt('755', 8); // rwxr-xr-x, without a legacy octal literal
  var current = '';

  segments.forEach(function(dir) {
    // Empty segments come from a leading or doubled slash; nothing to create.
    if (dir === '') {
      return;
    }
    if (current !== '') {
      current += '/';
    } else if (isAbsolute) {
      // Keep the leading slash so absolute paths are not created relative to cwd.
      current = '/';
    }
    current += dir;

    try {
      FS.mkdirSync(current, dirMode);
    } catch (e) {
      // A failure is only harmless when the directory is already there.
      var alreadyThere = false;
      try {
        alreadyThere = FS.statSync(current).isDirectory();
      } catch (statErr) {
        alreadyThere = false;
      }
      if (!alreadyThere) {
        throw e;
      }
    }
  });
}
/**
 * Build a response handler that stores the response body unchanged at `path`.
 *
 * @param {string} path - Destination file; missing parent directories are
 *   created via ensureDirs() when the handler runs.
 * @returns {function} Handler taking the HTTP response whose 'data' chunks
 *   are written to the file and whose 'end' closes it.
 */
function downloadTo(path) {
  function saveResponse(res) {
    ensureDirs(path);
    var out = FS.createWriteStream(path);
    console.log('>> ' + path);

    res.on('data', function(chunk) {
      out.write(chunk, 'binary');
    });
    res.on('end', function() {
      out.end();
    });
  }
  return saveResponse;
}
/**
 * Build a response handler that parses the response body as XML, passes every
 * attribute value through `attrMapper`, and writes the re-serialized document
 * to `path`.
 *
 * @param {string} path - Destination file; missing parent directories are
 *   created via ensureDirs() when the handler runs.
 * @param {function} attrMapper - Called as attrMapper(name, value) for each
 *   attribute; its return value becomes the attribute's new value.
 * @returns {function} Handler taking the HTTP response to process.
 */
function downloadXml(path, attrMapper) {
  return function(res) {
    ensureDirs(path);
    var f = FS.createWriteStream(path);
    console.log('<> ' + path);
    var p = new EXPAT.Parser();
    // Element currently being built; undefined while outside the root element.
    var el;
    p.addListener('startElement', function(name, attrs) {
      var attrs2 = {};
      for (var k in attrs) {
        if (attrs.hasOwnProperty(k))
          attrs2[k] = attrMapper(k, attrs[k]);
      }
      var child = new El(name, attrs2);
      if (el) {
        // Descend: the result of cnode() becomes the current element
        // (presumably the appended child -- confirm against node-xmpp).
        el = el.cnode(child);
      } else {
        el = child;
      }
    });
    p.addListener('endElement', function(name) {
      if (!el.parent) {
        // Closing an element without a parent: serialize the finished tree.
        f.write(el.toString());
        el = undefined;
      } else {
        el = el.parent;
      }
    });
    p.addListener('text', function(s) {
      // Text outside of any element is dropped.
      if (el)
        el.t(s);
    });
    p.addListener('xmlDecl', function(version, encoding, standalone) {
      // Each pseudo-attribute value must be opened and closed with the same
      // quote character, otherwise the emitted declaration is malformed.
      f.write('<?xml');
      if (version)
        f.write(' version="' + version + '"');
      if (encoding)
        f.write(' encoding="' + encoding + '"');
      if (standalone)
        f.write(' standalone="' + standalone + '"');
      f.write('?>');
    });
    res.on('data', function(data) {
      p.parse(data);
    });
    res.on('end', function() {
      // Final parse call with an empty chunk (second argument presumably
      // marks end of input -- confirm against node-expat), then close the file.
      p.parse('', true);
      f.end();
    });
  };
}
/**
 * Crawler that remembers which paths it has already handled.
 * Constructing one immediately visits the German index page of the
 * conference selected by `conference_id`.
 */
function Spider() {
  var startPage = '/schedule/' + conference_id + '/index.de.html';
  // Paths already handled, stored as keys of a plain object.
  this.seen = {};
  this.see(startPage);
}
/**
 * Resolve a link against the URL of the document it appeared in and reduce it
 * to a path on cccv.pentabarf.org.
 *
 * @param {string} path - Link target; may be relative, absolute or a full URL.
 * @param {string} [from] - Absolute URL the link is relative to. Defaults to
 *   the site root 'https://cccv.pentabarf.org/'.
 * @returns {?string} The pathname of the resolved URL (query string and
 *   fragment are dropped), or null when the link points to another host.
 */
var normalizePath = function(path, from) {
  from = from || 'https://cccv.pentabarf.org/';
  // Resolve against `from`, so relative links honour the referring document.
  var uri = URL.parse(URL.resolve(from, path));
  if (uri.hostname !== 'cccv.pentabarf.org')
    return null;
  return uri.pathname;
};
/**
 * Map a server path to its location below `baseDir`.
 * A leading '/schedule/<digits>/' prefix is stripped; any other path is
 * appended to `baseDir` as it is.
 *
 * @param {string} path - Path as found on the server or in a link.
 * @returns {string} Path prefixed with `baseDir`.
 */
var manglePath = function(path) {
  var scheduleMatch = /^\/schedule\/\d+\/(.+)$/.exec(path);
  return scheduleMatch
    ? baseDir + '/' + scheduleMatch[1]
    : baseDir + path;
};
/**
 * Download `path` unless it lies on another host or was already handled.
 * Paths ending in '.html' are parsed as XML: each `href`/`src` attribute is
 * followed recursively and rewritten to its mirrored location. Everything
 * else is stored unchanged.
 *
 * @param {string} path - Link target to visit.
 * @param {string} [from] - Base URL handed on to normalizePath().
 */
Spider.prototype.see = function(path, from) {
  path = normalizePath(path, from);
  if (!path)
    return;
  if (this.seen.hasOwnProperty(path))
    return;
  this.seen[path] = true;

  var outPath = outputDir + manglePath(path);
  if (/\.html$/.test(path)) {
    var that = this;
    // Function expression rather than a declaration nested in a block, whose
    // semantics differ between sloppy and strict mode.
    var rewriteAttr = function(k, v) {
      if (k !== 'href' && k !== 'src')
        return v;
      // Links leaving the mirrored host are neither crawled nor rewritten;
      // mangling them would yield paths like '/fahrplanhttp://...'.
      if (!normalizePath(v))
        return v;
      that.see(v);
      return manglePath(v);
    };
    get(path, downloadXml(outPath, rewriteAttr));
  } else {
    get(path, downloadTo(outPath));
  }
};
// Entry point: constructing a Spider immediately visits the conference index page.
new Spider();