c3d2-wiki/DVB-Scraping.mw

483 lines
14 KiB
Plaintext

[[Kategorie:Projekt]]
=Ruby=
==Telnet-Haltestellenmonitor==
{{Broken|
Reason=Die Struktur der WAP-Seiten war nicht permanent und wurde in der Zwischenzeit geändert. Jemand muss das Skript an die neuen URLs anpassen.|}}
<source lang="ruby">#!/usr/bin/env ruby
require 'net/http'
require 'socket'
require 'rexml/document'
# Error carrying a list of candidate station names; raised in this file when
# the departure monitor answers with a selection list instead of a result.
class MultipleStations < RuntimeError
  # @param stations [Array<String>] names of the possible stations
  def initialize(stations)
    @stations = stations
  end

  # @return [String] German message naming every candidate, comma separated
  def to_s
    "Mehrere mögliche Haltestellen: #{@stations.join(', ')}"
  end
end
# Departure board of one station, extracted from a result card and rendered
# as a plain-text table.
class StationResult
  # Reads the first two `p/strong` texts as station name and time, then pulls
  # (time, line, destination) triples out of the card's serialized markup.
  #
  # @param card [#each_element, #to_s] card element of the result document
  def initialize(card)
    headings = []
    card.each_element('p/strong') do |element|
      headings << element.text if headings.size < 2
    end
    @name, @time = headings

    # scan without a block returns one [time, tram, direction] array per match.
    @trams = card.to_s.scan(/br\/>(\d+:\d+\*?) (.+?)<br\/>-> (.+?)</)
  end

  # @return [String] heading line followed by a table with the columns
  #   Zeit / Linie / Ziel, each column padded to its widest cell
  def to_s
    titles = %w[Zeit Linie Ziel]
    widths = titles.each_index.map do |col|
      ([titles[col]] + @trams.map { |row| row[col] }).map(&:size).max
    end
    render = lambda do |cells|
      cells.each_with_index.map { |cell, col| cell.ljust(widths[col]) }.join(' | ')
    end

    header = render.call(titles)
    rule = widths.map { |width| '-' * width }.join('-+-')
    body = @trams.map { |row| render.call(row) }.join("\n")
    "\n\n#{@name}, #{@time}:\n\n#{header}\n#{rule}\n#{body}"
  end
end
# Serves a single connected client: prompts for a station name, queries the
# DVB WAP departure monitor and writes the answer back to the socket.
# All work happens on a thread started by the constructor.
class ClientHandler
  # Logs the connection and starts the handler thread. Any StandardError
  # raised while handling is reported to the client as "Fehler: ..."; the
  # socket is always closed afterwards.
  #
  # @param socket [#peeraddr, #print, #puts, #gets, #flush, #close]
  def initialize(socket)
    @socket = socket
    puts "#{address} connected"
    Thread.new do
      begin
        handle
      rescue StandardError => e
        # StandardError only: signals and SystemExit must not be swallowed.
        @socket.puts("Fehler: #{e}")
      ensure
        @socket.close
      end
    end
  end

  # @return [String] peer as "ip:port"; IPv6 addresses are bracketed
  def address
    family, port, _hostname, ip = @socket.peeraddr
    host = family == 'AF_INET6' ? "[#{ip}]" : ip.to_s
    "#{host}:#{port}"
  end

  # Queries the departure monitor for +station+ at the current time.
  #
  # @param station [String] station name as typed by the client
  # @return [String] formatted departure table
  # @raise [MultipleStations] if the service answers with a selection list
  # @raise [RuntimeError] on a non-success HTTP response, a missing card or
  #   an unexpected card id
  def ask_haltestellenmonitor(station)
    now = Time.now # one timestamp so time and date cannot straddle midnight
    # The station comes straight from the network; form-encode it so that
    # characters such as "&", "=" or "#" cannot alter the query string.
    query = URI.encode_www_form(
      :station => station,
      :action  => :check,
      :time    => now.strftime('%H:%M'),
      :date    => now.strftime('%d.%m.%Y')
    )
    res = Net::HTTP.start('wap.dvbag.de') do |http|
      http.get("/wapVVO/wap-rbl.php?#{query}")
    end
    raise res.class.to_s unless res.is_a?(Net::HTTPSuccess)

    root = REXML::Document.new(res.body).root
    card = nil
    # Keeps the last card if the document contains several.
    root.each_element('/wml/card') { |c| card = c } if root
    raise 'No card found in result document' unless card

    case card.attributes['id']
    when 'liste'
      stations = []
      card.each_element('p/select/option') { |option| stations << option.text }
      raise MultipleStations.new(stations)
    when 'result'
      StationResult.new(card).to_s
    else
      raise "Unexpected card/@id: #{card.attributes['id']}"
    end
  end

  # Runs the dialogue with the client: greeting, one line of input, answer.
  # Returns without querying when the client closes before sending a line.
  def handle
    @socket.print "Hallo #{address}\n\nHaltestelle: "
    @socket.flush
    haltestelle = @socket.gets
    return unless haltestelle

    haltestelle = haltestelle.strip
    puts "#{address} asks for #{haltestelle.inspect}"
    @socket.puts "Anfrage nach #{haltestelle}..."
    @socket.puts ask_haltestellenmonitor(haltestelle)
  end
end
# Listen on TCP port 65023 on 0.0.0.0 and hand every accepted connection to
# a ClientHandler.
server = TCPServer.new('0.0.0.0', 65023)
loop do
  connection = server.accept
  break unless connection

  ClientHandler.new(connection)
end
</source>
Und dann:
telnet localhost 65023
==NCurses Monitor==
Der VVO stellt jetzt ja für seine Widgets eine JSON(?)-Variante der Daten zur Verfügung.
<source lang="ruby">
require 'net/http'
require 'ncurses'
# Fetches departures from the VVO widget endpoint and returns them as raw
# string triples.
class DvbAbfahrt
  # Endpoint without query string; the parameters are appended by #fetch.
  BASE_URI = 'http://widgets.vvo-online.de/abfahrtsmonitor/Abfahrten.do'.freeze

  # Requests the departures for a stop.
  #
  # @param ort [String] value sent as the `ort` parameter
  # @param hst [String] value sent as the `hst` parameter
  # @param vz [#to_s] value sent as the `vz` parameter
  # @return [Array<Array<String>>] one three-element array per match of
  #   `digits,text,digits` in the response body
  def fetch(ort, hst, vz = 0)
    # TODO: exception handling (timeouts)
    # Form-encode the values instead of substituting placeholders: URI.escape
    # no longer exists in Ruby 3, and chained placeholder substitution broke
    # whenever an inserted value itself contained "HST" or "VZ".
    query = URI.encode_www_form(:ort => ort, :hst => hst, :vz => vz)
    body = umlauts_hack(Net::HTTP.get(URI.parse("#{BASE_URI}?#{query}")))
    body.scan(/(\d+),([^,]{1,}),(\d+)/)
  end

  # Strips `&quot;` and replaces decimal character references (`&#252;`)
  # with the corresponding UTF-8 character. Modifies +s+ in place.
  #
  # @param s [String] mutable string to clean up
  # @return [String] the same string object
  def umlauts_hack(s)
    s.gsub!('&quot;', '')
    s.gsub!(/&#(\d+);/) do |entity|
      code_point = Regexp.last_match(1).to_i
      # Leave references outside the Unicode range untouched.
      code_point <= 0x10FFFF ? [code_point].pack('U') : entity
    end
    s
  end
end
# Full-screen ncurses departure board for one stop; refreshes periodically.
class Monitor
  # Initializes the ncurses screen (yellow on black, cursor hidden), shows a
  # "Loading..." notice and stores the query parameters.
  #
  # @param ort [String] town passed on to DvbAbfahrt#fetch
  # @param hst [String] stop name; also used as the title
  # @param vz [Integer] third argument passed on to DvbAbfahrt#fetch
  def initialize(ort, hst, vz = 0)
    Ncurses.initscr
    Ncurses.start_color
    Ncurses.init_pair(1, Ncurses::COLOR_YELLOW, Ncurses::COLOR_BLACK)
    Ncurses.attron(Ncurses.COLOR_PAIR(1))
    Ncurses.curs_set(0)
    Ncurses.move(0, 0)
    Ncurses.printw 'Loading...'
    Ncurses.refresh
    @sep = '|'      # column separator
    @timeout = 30   # seconds slept between refreshes
    @wlno = 3       # width of the line-number column
    @wlname = 20    # width of the destination column
    @weta = 3       # width of the minutes column
    @ort = ort
    @hst = hst
    @vz = vz
    @dvb = DvbAbfahrt.new
    @lines = 0      # number of rows drawn in the previous cycle
  end

  # Draws the stop name centered over the table width in reverse bold.
  def print_title
    Ncurses.attron(Ncurses::A_REVERSE | Ncurses::A_BOLD)
    Ncurses.mvprintw(0, 0, @hst.center(@wlno + @wlname + @weta + 2)) # +2 separators
    Ncurses.attroff(Ncurses::A_REVERSE | Ncurses::A_BOLD)
  end

  # Fetches and redraws the departures forever, sleeping @timeout seconds
  # between cycles. Never returns normally.
  def mainloop
    print_title
    loop do
      info = @dvb.fetch(@ort, @hst, @vz)
      # clearing old lines if needed
      if @lines >= info.size
        Ncurses.clear
        print_title
      end
      @lines = info.size
      info.each_with_index do |departure, row|
        Ncurses.mvprintw(row + 1, 0, format_departure(departure))
      end
      Ncurses.refresh
      Ncurses.move(0, 0)
      sleep @timeout
    end
  end

  private

  # Renders one [line, destination, minutes] triple as a table row.
  def format_departure(departure)
    line_no, destination, eta = departure
    # Slice by length: the inclusive range 0..@wlname kept @wlname + 1
    # characters and pushed long destinations out of alignment.
    [
      line_no.rjust(@wlno),
      destination[0, @wlname].center(@wlname),
      eta.rjust(@weta)
    ].join(@sep)
  end
end
# Script entry point: show the monitor for the stop named by the first
# argument (town fixed to "Dresden"), or print a usage line.
if __FILE__ == $0
  if ARGV.empty?
    # No screen was initialized on this path, so there is nothing to tear down.
    puts "USAGE #{$0} <HALTESTELLENNAME>"
  else
    begin
      Monitor.new('Dresden', ARGV[0]).mainloop
    ensure
      # Restore the terminal however the loop ends (error, Ctrl-C, ...).
      Ncurses.attroff(Ncurses.COLOR_PAIR(1))
      Ncurses.endwin
    end
  end
end
</source>
==awk (gawk!)==
1.) DVB Stationsmonitor, Usage: ./pnv [station] (z.B. ./pnv Bischofsweg)
<source lang="bash">
#!/bin/bash
# DVB station monitor. Usage: ./pnv [station]   (default: "Mockritzer Strasse")
#
# Requires gawk: the request is sent through gawk's /inet/tcp coprocess.
#   RS  splits the response on "],[" and on the blank line after the headers
#   FS  splits each record on &quot; (optionally followed by ,&quot;)
# Every record containing "quot" is printed as  <$2> <$4> <$3>  under the
# header "Nr. Min. Richtung". The trailing perl filter turns numeric character
# references (&#NNN;) into one- or two-byte UTF-8 sequences.
#
# All spaces in the station name are replaced by %20 (${var//...}); the
# single-slash form only replaced the first one. Other characters are sent
# unescaped.
station=${@:-Mockritzer Strasse}
awk -vRS='\\],\\[|\r\n\r\n' -vFS='&quot;(,&quot;)*' -vstation="${station// /%20}" 'BEGIN { s="/inet/tcp/0/widgets.vvo-online.de/80"; print "GET /abfahrtsmonitor/Abfahrten.do?ort=Dresden&hst=" station "&vz=5 HTTP/1.1\r\nHost: widgets.vvo-online.de\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s; print "Nr. Min. Richtung"; while (s |& getline) { if ($0 ~ /quot/) { printf ("%3s %4s %s\n", $2, $4, $3); } } }' | perl -npe 's/&#(\d*);/($1 & ~0x7f ? chr(0xc0 | (($1 >> 6) & 0x3f)).chr(0x80 | ($1 & 0x3f)) : chr($1))/eg'
</source>
Und so sieht es aus:
<source lang="bash">
$ ./pnv
Nr. Min. Richtung
75 6 Pirnaischer Pl.
13 10 Mickten
13 11 Prohlis
75 12 Leubnitzer Höhe
9 13 Prohlis
9 13 Kaditz
9 14 Kaditz
75 15 Messe Dresden
89 15 Löbtau
9 15 Prohlis
</source>
2.) DVB Routenplanung, Usage: ./t [destination [station]] (z.B. `./t "Max Muster Strasse" HBF', `./t HBF' oder `./t')
<source lang="bash">
#!/bin/bash
# DVB route planner. Usage: ./t [destination [station]]
#
# Requires gawk: the request is sent through gawk's /inet/tcp coprocess to
# efa.vvo-online.de:8080 with the current date and time filled in by date(1).
#   RS  splits the HTML response into table rows (<tr ...> / </tr>)
#   FS  splits a row into cells (<td ...> / </td>) and on ("  ")
# Rows with <option value="N:N"> are collected as ambiguous origin/destination
# candidates and reported at the end ("Gehts etwas genauer?"). Rows containing
# #ROUTE supply the trip headings, and rows whose second field starts with
# HH:MM are printed as time / line / station. The trailing perl filter
# re-encodes bytes 0x80-0xff as two-byte UTF-8 sequences.
#
# All spaces in both station names are replaced by %20 (${var//...}); the
# single-slash form only replaced the first one, which broke names such as
# "Max Muster Strasse".
#
# NOTE(review): the parameter names itDateDay/itDateMonth/itDateYear and
# idtTimeMinute do not follow the itd* prefix of itdTimeHour - possibly typos;
# verify against the service before changing them.
dststation=${1:-My default target}
srcstation=${2:-My default station}
awk -vrequest="http://efa.vvo-online.de:8080/dvb/XSLT_TRIP_REQUEST2?sessionID=0&requestID=0&language=de&usage=xslt_trip&execInst=normal&command=&ptOptionsActive=-1&itOptionsActive=&itDateDay=`date +%d`&itDateMonth=`date +%m`&itDateYear=`date +%y`&place_origin=Dresden&placeState_origin=empty&type_origin=stop&name_origin=${srcstation// /%20}&nameState_origin=empty&place_destination=Dresden&placeState_destination=empty&type_destination=stop&name_destination=${dststation// /%20}&nameState_destination=empty&itdTripDateTimeDepArr=dep&itdTimeHour=`date +%H`&idtTimeMinute=`date +%M`" -vRS="<tr[^>]*>|</tr>" -vFS='<td[^>]*>|</td>|\\("|"\\)' 'BEGIN { isDest=0; route=0; curr=0; s="/inet/tcp/0/efa.vvo-online.de/8080"; print "GET " request " HTTP/1.1\r\nHost: efa.vvo-online.de\r\nUser-Agent: akts!zr\r\nAccept: */*\r\nConnection: close\r\n\r\n" |& s; while (s |& getline) { if($0 ~ /option value="[0-9]+:[0-9]+"/) { if($0 ~ /name_destination/) isDest = 1; split($0, a, /<option[^>]*>|<\/option>/); if(isDest) for(pos = 2; a[pos]; pos += 2) dest = dest (dest ? ", " : "") a[pos]; else for(pos = 2; a[pos]; pos += 2) src = src (src ? ", " : "") a[pos]; continue; } if($0 ~ /#ROUTE/) { split($0, a, /<a[^>]*>|<\/a>/); routes[++route]=a[2]; continue; } if($0 ~ /"ROUTE_[0-9]"/) { print routes[++curr]; print " Zeit Linie Station"; continue; } if($2 ~ /^[0-9][0-9]:[0-9][0-9]/) { split($11, a, / /); line=a[2]; printf (" %s %5s %s\n", $2, line, $4 " " $6); } } if(src || dest) { print "Gehts etwas genauer?"; if(src) print "Einstieg: " src "?"; if(dest) print "Ausstieg: " dest "?"; } }' | perl -npe 's/([\x80-\xff])/(chr(0xc0 | ((ord($1) >> 6) & 0x3f)).chr(0x80 | (ord($1) & 0x3f)))/eg'
</source>
Und so sieht es aus:
<source lang="bash">
$ ./t Terminal Mock
Gehts etwas genauer?
Einstieg: Campingplatz Mockritz, Mockethaler Straße, Mockritz, Mockritzer Straße?
$ ./t Terminal "Mockritzer Str"
1. Fahrt am 01.12.2008 18:40 - 19:44 Uhr
Zeit Linie Station
18:40 13 ab Dresden Mockritzer Straße
19:02 an Dresden Bischofsweg
19:05 7 ab Dresden Bischofsweg
19:16 an Dresden Infineon Nord
19:26 77 ab Dresden Infineon Nord
19:32 an Dresden Flughafen
2. Fahrt am 01.12.2008 19:05 - 19:49 Uhr
Zeit Linie Station
19:05 75 ab Dresden Mockritzer Straße
19:08 an Dresden Hp Strehlen
19:13 S ab Dresden-Strehlen
19:40 an Dresden Flughafen
19:48 77 ab Dresden Flughafen
19:49 an Dresden Flughafen Terminal 1
3. Fahrt am 01.12.2008 19:05 - 19:57 Uhr
...
</source>
=Python=
==CLI-Interface==
<source lang="python">
#!/usr/bin/python
import sys
from urllib import urlencode, urlopen
from optparse import OptionParser
from BeautifulSoup import BeautifulStoneSoup
import simplejson
widgets_base_url = "http://widgets.vvo-online.de/abfahrtsmonitor/"
def get_connections(stop=None, town=None, time=None):
    """
    Fetch the upcoming connections at *stop* in *town*, starting *time*
    minutes from now.

    Arguments left as None are omitted from the request. Returns the
    structure decoded from the service's JSON answer.
    """
    candidates = (("hst", stop), ("ort", town), ("vz", time))
    query_params = [(key, value) for key, value in candidates
                    if value is not None]
    query_url = "%sAbfahrten.do?%s" % (widgets_base_url,
                                       urlencode(query_params))
    raw_page = urlopen(query_url).read()
    # The payload arrives with HTML entities; let the parser convert them
    # before the text is handed to the JSON decoder.
    soup = BeautifulStoneSoup(raw_page, convertEntities="html")
    return simplejson.loads(soup.contents[0])
def find_stops(stop, town=None):
    """
    Look up stops matching the name *stop*, optionally restricted to *town*.

    Returns the pair ``(towns, stops)`` decoded from the service's answer.
    """
    query_params = [("hst", stop)]
    if town is not None:
        query_params += [("ort", town)]
    query_url = "%sHaltestelle.do?%s" % (widgets_base_url,
                                         urlencode(query_params))
    raw_page = urlopen(query_url).read()
    # Convert HTML entities first, then decode the remaining text as JSON.
    soup = BeautifulStoneSoup(raw_page, convertEntities="html")
    towns, stops = simplejson.loads(soup.contents[0])
    return towns, stops
def format_connections(connections):
    """
    Format connections into a text table and yield its rows.

    *connections* is an iterable of ``(line_name, destination, time)``
    triples of strings. The first two rows yielded are the header and a
    separator; they are produced even when there are no connections.
    The line column is at least 5 and the destination column at least 23
    characters wide; the arrival column is fixed at 7.
    """
    # Materialize once: the data is walked three times below, which would
    # silently exhaust a generator.
    connections = list(connections)
    # Lists instead of ``max(23, *lengths)``: with no connections that call
    # degenerates to ``max(23)`` and raises TypeError.
    destination_width = max([23] + [len(d) for _, d, _ in connections])
    line_name_width = max([5] + [len(l) for l, _, _ in connections])

    row_format = "%%-%ds | %%%ds | %%7s" % (line_name_width,
                                            destination_width)
    yield row_format % ("line", "destination", "arrival")
    yield "-+-".join("-" * width
                     for width in (line_name_width, destination_width, 7))
    for line_name, destination, time in connections:
        yield row_format % (line_name, destination, time)
def print_connections(stop, town, connections, limit=None):
    """
    Print *connections* at *stop* in *town* to stdout.

    *town* may be None, in which case it is left out of the messages.
    If *limit* is a non-zero number only that many connections are printed,
    otherwise all of them. When *connections* is empty a notice is printed
    and the process exits with status 1.
    """
    # Where to: "<stop> in <town>" or just "<stop>" when no town is known,
    # so the messages never read "in None".
    if town is not None:
        location = "%s in %s" % (stop, town)
    else:
        location = "%s" % (stop,)

    if len(connections) == 0:
        print("No connections at %s." % (location,))
        sys.exit(1)

    print("Next connections at %s:" % (location,))
    print("")
    if limit:
        connections_table = format_connections(connections[:limit])
    else:
        connections_table = format_connections(connections)
    for line in connections_table:
        print(line)
def main():
    """
    Command line entry point: parse the options, optionally resolve the stop
    name through the lookup service and print the next connections.
    """
    option_parser = OptionParser(
        usage="%prog [options] [<town>] <stop>")
    option_parser.add_option("-l", "--limit",
                             help="maximum number of connections to display",
                             type="int",
                             default=5)
    option_parser.add_option("-t", "--time",
                             help="minimum time to departure",
                             type="int",
                             default=None)
    option_parser.add_option("-k", "--no-lookup",
                             help="do not look up stop name",
                             action="store_false",
                             dest="lookup_stop",
                             default=True)
    options, args = option_parser.parse_args()

    # sanitize options: negative values mean "not set"
    if options.limit is not None and options.limit < 0:
        options.limit = None
    # Explicit None check: comparing None with 0 only works on Python 2.
    if options.time is not None and options.time < 0:
        options.time = None

    if len(args) == 1:
        stop = args[0]
        town = None
    elif len(args) == 2:
        town, stop = args
    elif len(args) == 0:
        # error() prints the usage and exits.
        option_parser.error("Not enough arguments")
    else:
        option_parser.error("Too many arguments")

    if options.lookup_stop:
        towns, stops = find_stops(stop, town)
        if len(towns) == 0:
            print("No town named '%s'." % (town,))
            sys.exit(1)
        if len(stops) == 0:
            print("No stop named '%s' in the following towns:" % (stop,))
            print("\n".join(" " + t[0] for t in towns))
            sys.exit(1)
        # Show a table for every stop the lookup returned.
        for stop_name, town, stop_id in stops:
            connections = get_connections(stop_id, time=options.time)
            print_connections(stop_name, town, connections, options.limit)
            print("")
    else:
        connections = get_connections(stop, town, options.time)
        print_connections(stop, town, connections, options.limit)


if __name__ == "__main__":
    main()
</source>
[[Kategorie:Ruby]]
{{Rübÿ Spëëd Mëtäl Cödïng}}