204 lines
4.5 KiB
JavaScript
Executable File
204 lines
4.5 KiB
JavaScript
Executable File
#!/usr/bin/env node
|
|
|
|
var URL = require('url');
|
|
var HTTPS = require('http');
|
|
var FS = require('fs');
|
|
var EXPAT = require('node-expat');
|
|
var El = require('node-xmpp').Element;
|
|
var async = require('async');
|
|
var spawn = require('child_process').spawn;
|
|
|
|
|
|
// Command-line handling: exactly three positional arguments must follow the
// interpreter and script path, otherwise print the usage line and exit with 1.
var cliArgs = process.argv.slice(2);
if (cliArgs.length !== 3) {
    console.error('Usage: <user> <pass> <conference_id>');
    process.exit(1);
}

// Credentials passed to curl, and the conference id used to build the start URL.
var user = cliArgs[0];
var pass = cliArgs[1];
var conference_id = cliArgs[2];

// outputDir is prepended to every local output path; baseDir is the prefix
// that rewritten links and mirrored files are placed under.
var outputDir = '.';
var baseDir = '/2012/fahrplan';
|
|
|
|
/**
 * Download queue: runs at most four curl processes at a time.
 *
 * Each task is an object `{ path, resCb }`. The worker starts curl for
 * `path` on cccv.pentabarf.org, hands curl's stdout stream to `resCb`, and
 * reports the task as finished when the curl process emits 'exit'
 * (the exit event's arguments are forwarded to the queue callback as-is).
 *
 * NOTE(review): the credentials are passed on curl's command line, where
 * they may be visible to other local users (e.g. via `ps`) - confirm this is
 * acceptable for the environment the script runs in.
 */
var getQueue = async.queue(function fetchWithCurl(task, cb) {
    console.log("GET", task.path);

    var curlArgs = [
        "-u", user + ':' + pass,
        "https://cccv.pentabarf.org" + task.path
    ];
    var child = spawn('curl', curlArgs);

    // The consumer gets the stream right away so it can attach its listeners
    // before any output arrives.
    task.resCb(child.stdout);

    child.on('exit', cb);
}, 4);
|
|
/**
 * Schedule a download on the shared queue.
 *
 * @param {string} path - server path to fetch
 * @param {function} cb - called with the response body stream once the
 *                        download has been started by the queue worker
 */
function get(path, cb) {
    var task = {
        path: path,
        resCb: cb
    };
    getQueue.push(task);
}
|
|
|
|
/**
 * Encode a value as Base64.
 *
 * @param {string|Buffer|Array|ArrayBuffer} s - data to encode; strings are
 *        encoded as UTF-8 first
 * @returns {string} the Base64 representation
 * @throws {TypeError} if `s` is a number or another type Buffer.from() rejects
 */
function base64(s) {
    // Buffer.from() replaces the deprecated `new Buffer(s)`, which allocated
    // a raw buffer of that size when handed a number instead of failing.
    return Buffer.from(s).toString('base64');
}
|
|
|
|
/**
 * Tell whether `path` exists and is a directory.
 * Any stat failure is treated as "not a directory".
 */
function isExistingDirectory(path) {
    try {
        return FS.statSync(path).isDirectory();
    } catch (e) {
        return false;
    }
}

/**
 * Create every missing parent directory of a file path.
 *
 * The last '/'-separated component is taken to be the file name and is not
 * created. Both relative and absolute paths are supported.
 *
 * @param {string} path - path of the file that is about to be written
 * @throws {Error} the mkdir error, if a directory could not be created and
 *                 does not already exist
 */
function ensureDirs(path) {
    var parts = path.split('/');
    parts.pop(); // drop the file name

    var current = '';
    parts.forEach(function(dir, index) {
        // Always re-insert the separator after the first component, so the
        // empty first component of an absolute path yields a leading '/'.
        current = (index === 0) ? dir : current + '/' + dir;
        if (current === '')
            return; // file system root of an absolute path

        try {
            // 0o755 instead of the legacy literal 0755, which is a syntax
            // error in strict-mode and ES-module code.
            FS.mkdirSync(current, 0o755);
        } catch (e) {
            // An already existing directory is the expected case; anything
            // else (permissions, a file in the way, ...) is a real failure.
            if (!isExistingDirectory(current))
                throw e;
        }
    });
}
|
|
|
|
/**
 * Build a response handler that stores a response body verbatim in a file.
 *
 * @param {string} path - local file to write; missing parent directories
 *                        are created when the handler runs
 * @returns {function(stream.Readable)} handler that takes the body stream
 */
function downloadTo(path) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('>> ' + path);

        // pipe() copies the chunks unchanged, ends the file when the source
        // ends and, unlike a manual 'data' -> write() loop, pauses the source
        // while the file stream's buffer is full.
        res.pipe(f);
    };
}
|
|
|
|
// Elements that are given an empty text child when they end without any
// children - presumably so that the serializer writes them with an explicit
// end tag instead of a self-closing one (confirm against Element#toString).
var NON_EMPTY_ELS = ["textarea", "ul", "ol"];

/**
 * Build a response handler that parses the body as XML, passes every
 * attribute value through `attrMapper`, and writes the re-serialized
 * document to a file.
 *
 * Only the XML declaration, elements, attributes and text are reproduced;
 * no listeners are registered for any other parser event.
 *
 * @param {string} path - local file to write; missing parent directories
 *                        are created when the handler runs
 * @param {function(string, string)} attrMapper - called as
 *        attrMapper(attributeName, value); its return value is stored as the
 *        attribute's new value
 * @returns {function(stream.Readable)} handler that takes the body stream
 */
function downloadXml(path, attrMapper) {
    return function(res) {
        ensureDirs(path);
        var f = FS.createWriteStream(path);
        console.log('<> ' + path);

        var p = new EXPAT.Parser();
        // Element currently being built; undefined while outside the root.
        var el;
        // Set once the parser has rejected the input, so it is reported once.
        var parseFailed = false;

        // Feed one chunk to the parser and report the first failure instead
        // of silently producing a truncated file.
        function feed(data, isFinal) {
            if (parseFailed)
                return;
            if (!p.parse(data, isFinal)) {
                parseFailed = true;
                var reason = (typeof p.getError === 'function') ?
                    p.getError() : 'unknown error';
                console.error('!! ' + path + ': XML parse error: ' + reason);
            }
        }

        p.addListener('startElement', function(name, attrs) {
            var attrs2 = {};
            for (var k in attrs) {
                if (attrs.hasOwnProperty(k))
                    attrs2[k] = attrMapper(k, attrs[k]);
            }

            var child = new El(name, attrs2);
            if (el) {
                // Descend: the value returned by cnode() becomes the current
                // element (presumably the appended child - confirm against
                // node-xmpp's Element).
                el = el.cnode(child);
            } else
                el = child;
        });

        p.addListener('endElement', function(name) {
            if (NON_EMPTY_ELS.indexOf(name) >= 0 &&
                el.children.length === 0)
                // prevent some empty elements
                el.t('');

            if (!el.parent) {
                // The root element is complete: serialize the whole tree.
                f.write(el.toString());
                el = undefined;
            } else
                el = el.parent;
        });

        p.addListener('text', function(s) {
            // Text outside of the root element is dropped.
            if (el)
                el.t(s);
        });

        p.addListener('xmlDecl', function(version, encoding, standalone) {
            f.write('<?xml');
            if (version)
                f.write(' version="' + version + '"');
            if (encoding)
                f.write(' encoding="' + encoding + '"');
            // Only a truthy flag is written; a falsy value is treated as
            // "attribute absent", so standalone="no" is never emitted.
            if (standalone)
                f.write(' standalone="yes"');
            f.write("?>\n");
        });

        res.on('data', function(data) {
            feed(data, false);
        });
        res.on('end', function() {
            // An empty final chunk tells the parser the document is complete.
            feed('', true);
            f.end();
        });
    };
}
|
|
|
|
/**
 * Crawler state. Constructing a Spider immediately visits the German index
 * page of the conference given on the command line.
 */
function Spider() {
    // Server paths that have already been visited, used as a set.
    this.seen = {};

    var entryPage = '/schedule/' + conference_id + '/index.de.html';
    this.see(entryPage);
}
|
|
|
|
/**
 * Map a link to the server path it has to be downloaded from.
 *
 * @param {string} path - link target: absolute URL, absolute path or a path
 *                        relative to the referring document
 * @param {string} [from] - URL or server path of the referring document;
 *                          defaults to the site root
 * @returns {?string} the path name on cccv.pentabarf.org (query string and
 *                    fragment removed), or null if the link points to
 *                    another host
 */
function normalizePath(path, from) {
    var siteRoot = 'https://cccv.pentabarf.org/';

    // Anchor the referrer on the site root first, so that a bare server path
    // is accepted as `from` as well as a full URL.
    var base = URL.resolve(siteRoot, from || '');

    var uri = URL.parse(URL.resolve(base, path));
    if (uri.hostname !== 'cccv.pentabarf.org')
        return null;
    return uri.pathname;
}
|
|
|
|
/**
 * Rewrite a link found in a downloaded page (or a server path) to its
 * location in the mirror.
 *
 * - "/schedule/<id>/<rest>"                    -> baseDir + "/<rest>"
 * - "/(xml|ical|xcal)/(schedule|conference)/<id>..."
 *                                              -> baseDir + "/schedule.<ext>"
 *   with ext "ics" for ical, "xcs" for xcal and "xml" for xml
 * - anything containing "datenspuren.de/fahrplan/<rest>"
 *                                              -> "/fahrplan/<rest>", where
 *   "day_<x>.html" becomes "day/<x>.html"
 * - URLs with a scheme (http://..., mailto:...) and fragment-only links
 *   ("#top") are returned unchanged
 * - everything else                            -> baseDir + path
 *
 * NOTE(review): a relative path without a leading "/" is appended to baseDir
 * without a separator - confirm whether such links occur in the input.
 *
 * @param {string} path - link target or server path
 * @returns {string} the rewritten link
 */
function manglePath(path) {
    var m;

    m = path.match(/^\/schedule\/\d+\/(.+)$/);
    if (m)
        return baseDir + '/' + m[1];

    m = path.match(/^\/(xml|ical|xcal)\/(schedule|conference)\/\d+/);
    if (m) {
        // File extension per export format; "xml" maps to itself.
        var extensions = { ical: 'ics', xcal: 'xcs' };
        return baseDir + '/schedule.' + (extensions[m[1]] || m[1]);
    }

    m = path.match(/datenspuren\.de\/fahrplan\/(.*)$/);
    if (m)
        return '/fahrplan/' + m[1].replace(/day_(.+)\.html/, 'day/$1.html');

    // Leave external and non-path links alone. Besides "scheme://" URLs this
    // covers scheme-only URIs such as "mailto:" and links into the current
    // page, which must not get a directory prefixed.
    if (path.indexOf('://') >= 0 ||
        /^[a-z][a-z0-9+.\-]*:/i.test(path) ||
        path.charAt(0) === '#')
        return path;

    return baseDir + path;
}
|
|
|
|
/**
 * Visit a link: download it unless it points to another host or has been
 * visited before.
 *
 * Paths ending in ".html" are always fetched and run through downloadXml();
 * every href/src attribute in them is visited in turn and rewritten with
 * manglePath(), and form actions are made absolute. Any other path is stored
 * verbatim, and only if the output file does not exist yet.
 *
 * @param {string} path - link target as found in a document
 * @param {string} [from] - server path of the document containing the link
 */
Spider.prototype.see = function(path, from) {
    var target = normalizePath(path, from);
    if (!target)
        return;

    if (this.seen.hasOwnProperty(target))
        return;
    this.seen[target] = true;

    var outPath = outputDir + manglePath(target);

    if (/\.html$/.test(target)) {
        var that = this;
        var rewriteAttr = function(k, v) {
            switch(k) {
            case 'href':
            case 'src':
                // Pass the current page along as the referrer of its links.
                that.see(v, target);
                return manglePath(v);
            case 'action':
                return (v.indexOf('://') >= 0) ? v : 'https://cccv.pentabarf.org' + v;
            default:
                return v;
            }
        };
        get(target, downloadXml(outPath, rewriteAttr));
    } else {
        // just binary download
        try {
            FS.statSync(outPath);
            console.log('.. ' + outPath);
        } catch(e) {
            // Any stat failure is treated like a missing file and triggers
            // the download.
            get(target, downloadTo(outPath));
        }
    }
};
|
|
|
|
// Entry point: constructing the Spider immediately visits the conference's
// index page (see the constructor), which starts the crawl.
new Spider();
|
|
|