c3d2-web/scripts/fahrplan_download.js

204 lines
4.5 KiB
JavaScript
Executable File

#!/usr/bin/env node
var URL = require('url');
var HTTPS = require('http');
var FS = require('fs');
var EXPAT = require('node-expat');
var El = require('node-xmpp').Element;
var async = require('async');
var spawn = require('child_process').spawn;
/*
 * Command line handling and static configuration.
 * Exactly three positional arguments are required after the node binary
 * and the script name: <user> <pass> <conference_id>.
 */
var cliArgs = process.argv.slice(2);
if (cliArgs.length !== 3) {
    console.error('Usage: <user> <pass> <conference_id>');
    process.exit(1);
}

/* Credentials handed to curl, and the id of the conference to crawl. */
var user = cliArgs[0];
var pass = cliArgs[1];
var conference_id = cliArgs[2];

/* outputDir is prepended to every local file name that gets written. */
var outputDir = '.';
/* baseDir is the path prefix used for rewritten links and saved files. */
var baseDir = '/2012/fahrplan';
/*
 * Work queue for downloads; at most 4 tasks are processed at a time.
 * Each task is { path, resCb }: `path` is appended to the site origin and
 * fetched with a spawned `curl` using the credentials from the command line,
 * and `resCb` is handed curl's stdout stream.
 * NOTE(review): the 'exit' listener is the queue callback itself, so it
 * receives curl's exit code as its first argument - confirm that treating a
 * non-zero exit code as the task's error value is intended.
 */
var getQueue = async.queue(function(job, done) {
    console.log("GET", job.path);
    var credentials = user + ':' + pass;
    var url = "https://cccv.pentabarf.org" + job.path;
    var child = spawn('curl', ["-u", credentials, url]);
    job.resCb(child.stdout);
    child.on('exit', done);
}, 4);
/**
 * Schedule a download of `path`.
 *
 * @param {string} path - server path to fetch
 * @param {function} cb - stored on the task as `resCb`
 */
function get(path, cb) {
    var task = {
        path: path,
        resCb: cb
    };
    getQueue.push(task);
}
/**
 * Base64-encode a value.
 *
 * Uses Buffer.from() instead of the deprecated `new Buffer()` constructor,
 * which allocated a buffer of that many bytes when given a number rather
 * than encoding it.
 *
 * @param {string|Buffer|Array} s - data to encode; strings are encoded as UTF-8
 * @returns {string} the base64 representation
 * @throws {TypeError} if `s` is a number (raised by Buffer.from)
 */
function base64(s) {
    return Buffer.from(s).toString('base64');
}
/**
 * Create every missing parent directory of a file path.
 *
 * The last path component is taken to be the file name and is not created.
 * Directories are created with mode 0755 (subject to the process umask).
 *
 * @param {string} path - '/'-separated file path, relative or absolute
 * @throws {Error} the mkdir error when a component cannot be created and
 *                 does not already exist as a directory
 */
function ensureDirs(path) {
    // Spelled via parseInt because a legacy octal literal (0755) is a
    // syntax error in strict-mode / module code.
    var DIR_MODE = parseInt('755', 8);

    var components = path.split('/');
    components.pop(); // drop the file name

    var current = '';
    components.forEach(function(dir, index) {
        if (dir === '') {
            // A leading empty component means the path is absolute; keep the
            // root instead of silently turning it into a relative path.
            if (index === 0)
                current = '/';
            // Other empty components come from doubled slashes: skip them.
            return;
        }
        if (current !== '' && current !== '/')
            current += '/';
        current += dir;

        try {
            FS.mkdirSync(current, DIR_MODE);
        } catch (e) {
            // An already existing directory is the expected case; anything
            // else (permissions, a regular file in the way, ...) is reported.
            var isDir = false;
            try {
                isDir = FS.statSync(current).isDirectory();
            } catch (statErr) {
                // fall through and rethrow the original mkdir error
            }
            if (!isDir)
                throw e;
        }
    });
}
/**
 * Build a response handler that stores the response body unmodified.
 *
 * @param {string} path - local file to write; parent directories are created
 * @returns {function} handler taking a readable stream; every 'data' chunk is
 *                     written to the file and the file is closed on 'end'
 */
function downloadTo(path) {
    return function(res) {
        ensureDirs(path);
        var out = FS.createWriteStream(path);
        console.log('>> ' + path);

        var onChunk = function(chunk) {
            out.write(chunk, 'binary');
        };
        var onDone = function() {
            out.end();
        };

        res.on('data', onChunk);
        res.on('end', onDone);
    };
}
/* Elements that are given an empty text child when they close without children. */
var NON_EMPTY_ELS = ["textarea", "ul", "ol"];

/**
 * Build a response handler that parses the response as XML, passes every
 * attribute through `attrMapper`, and writes the re-serialized document.
 *
 * Only the XML declaration, elements, attributes and text inside the root
 * element are written out; no other parser events are listened for.
 *
 * @param {string} path - local file to write; parent directories are created
 * @param {function} attrMapper - called as attrMapper(name, value) for each
 *                                attribute; its return value replaces the value
 * @returns {function} handler taking a readable stream
 */
function downloadXml(path, attrMapper) {
    return function(res) {
        ensureDirs(path);
        var out = FS.createWriteStream(path);
        console.log('<> ' + path);

        var parser = new EXPAT.Parser();
        // Element currently being built; undefined while outside the root.
        var current;

        parser.addListener('startElement', function(name, attrs) {
            var mapped = {};
            for (var key in attrs) {
                if (!attrs.hasOwnProperty(key))
                    continue;
                mapped[key] = attrMapper(key, attrs[key]);
            }
            var node = new El(name, mapped);
            // Descend into the new child, or start the tree at the root.
            current = current ? current.cnode(node) : node;
        });

        parser.addListener('endElement', function(name) {
            var needsContent = NON_EMPTY_ELS.indexOf(name) >= 0;
            if (needsContent && current.children.length === 0) {
                // prevent some empty elements
                current.t('');
            }
            if (current.parent) {
                current = current.parent;
                return;
            }
            // The root element just closed: serialize the whole tree.
            out.write(current.toString());
            current = undefined;
        });

        parser.addListener('text', function(s) {
            if (current) {
                current.t(s);
            }
        });

        parser.addListener('xmlDecl', function(version, encoding, standalone) {
            var decl = '<?xml';
            if (version) {
                decl += ' version="' + version + '"';
            }
            if (encoding) {
                decl += ' encoding="' + encoding + '"';
            }
            if (standalone) {
                // The 'no' arm can never be taken inside this guard.
                decl += ' standalone="' + (standalone ? 'yes' : 'no') + '"';
            }
            out.write(decl + "?>\n");
        });

        res.on('data', function(chunk) {
            parser.parse(chunk);
        });
        res.on('end', function() {
            // Signal end of input to the parser before closing the file.
            parser.parse('', true);
            out.end();
        });
    };
}
/**
 * Crawler state. Constructing a Spider immediately visits the German index
 * page of the conference given on the command line.
 */
function Spider() {
    // Server paths that have already been visited, used as a set.
    this.seen = {};

    var startPage = '/schedule/' + conference_id + '/index.de.html';
    this.see(startPage);
}
/**
 * Map a link to the server path that has to be downloaded.
 *
 * The link is resolved against the page it was found on (`from`), falling
 * back to the site root when no referrer is given. Previously `from` was
 * accepted but ignored, so relative links always resolved against the root.
 *
 * @param {string} path - link as found in a document (absolute or relative)
 * @param {string} [from] - URL or server path of the referring page
 * @returns {?string} the pathname on cccv.pentabarf.org (query string and
 *                    fragment dropped), or null for links to other hosts
 *                    and for non-string input
 */
function normalizePath(path, from) {
    var SITE_ROOT = 'https://cccv.pentabarf.org/';
    if (typeof path !== 'string')
        return null;
    // Anchor the referrer at the site root first so that a path-only `from`
    // (e.g. '/schedule/1/index.de.html') works as well as a full URL.
    var base = from ? URL.resolve(SITE_ROOT, from) : SITE_ROOT;
    var uri = URL.parse(URL.resolve(base, path));
    if (uri.hostname !== 'cccv.pentabarf.org')
        return null;
    return uri.pathname;
}
/**
 * Rewrite a link for the mirrored xhtml files.
 *
 * Rules, first match wins:
 *  - /schedule/<id>/<rest>              -> baseDir + '/' + <rest>
 *  - /(xml|ical|xcal)/(schedule|conference)/<id>...
 *                                       -> baseDir + '/schedule.' + extension
 *                                          (ical -> ics, xcal -> xcs, xml -> xml)
 *  - ...datenspuren.de/fahrplan/<rest>  -> '/fahrplan/' + <rest>, with
 *                                          day_<x>.html turned into day/<x>.html
 *  - anything containing '://'          -> returned unchanged
 *  - everything else                    -> baseDir + path
 *
 * @param {string} path - link as found in a document
 * @returns {string} the rewritten link
 */
function manglePath(path) {
    var EXTENSIONS = { ical: 'ics', xcal: 'xcs' };

    var schedulePage = path.match(/^\/schedule\/\d+\/(.+)$/);
    if (schedulePage) {
        return baseDir + '/' + schedulePage[1];
    }

    var exportLink = path.match(/^\/(xml|ical|xcal)\/(schedule|conference)\/\d+/);
    if (exportLink) {
        var format = exportLink[1];
        var ext = EXTENSIONS.hasOwnProperty(format) ? EXTENSIONS[format] : format;
        return baseDir + '/schedule.' + ext;
    }

    var oldSite = path.match(/datenspuren\.de\/fahrplan\/(.*)$/);
    if (oldSite) {
        var rest = oldSite[1].replace(/day_(.+)\.html/, 'day/$1.html');
        return '/fahrplan/' + rest;
    }

    if (path.indexOf('://') >= 0) {
        return path;
    }
    return baseDir + path;
}
/**
 * Visit a link: download it once and, for html pages, follow its links.
 *
 * Links that normalizePath() rejects and paths that were already seen are
 * ignored. Paths ending in '.html' are always fetched and rewritten through
 * downloadXml(); everything else is fetched verbatim, but only when the
 * local file cannot be stat'ed (i.e. presumably does not exist yet).
 *
 * @param {string} path - link to visit
 * @param {string} [from] - referring page, passed on to normalizePath()
 */
Spider.prototype.see = function(path, from) {
    var self = this;
    var target = normalizePath(path, from);
    if (!target || self.seen.hasOwnProperty(target)) {
        return;
    }
    self.seen[target] = true;

    var outPath = outputDir + manglePath(target);

    if (!/\.html$/.test(target)) {
        // just binary download
        try {
            FS.statSync(outPath);
            console.log('.. '+outPath);
        } catch(e) {
            // statSync() raised ENOENT?
            get(target, downloadTo(outPath));
        }
        return;
    }

    // Attribute hook for downloadXml(): follow and rewrite links, and make
    // form targets absolute so they keep pointing at the original site.
    var rewriteAttr = function(k, v) {
        if (k === 'href' || k === 'src') {
            self.see(v);
            return manglePath(v);
        }
        if (k === 'action') {
            if (v.indexOf('://') >= 0) {
                return v;
            }
            return 'https://cccv.pentabarf.org' + v;
        }
        return v;
    };
    get(target, downloadXml(outPath, rewriteAttr));
};
// Start the crawl: the Spider constructor visits the conference index page.
new Spider();