#!/usr/bin/env ruby
# coding: utf-8
# frozen_string_literal: true

# Scrapes the event listing of medienkulturzentrum.de and prints it as an
# iCalendar (RFC 5545) feed on stdout. Progress and diagnostics are written to
# stderr so that they never end up inside the calendar data.
#
# Provenance: added as mkz-programm/scrape.rb by commit
# 5e9d603cedc9a7d727b5be9cec0ee378c7f6ca9e ("mkz-programm: init",
# Astro, 2021-10-13).

require 'uri'
require 'open-uri'
require 'nokogiri'
require 'erb'

# First page of the paginated event listing.
LISTING_URL = 'https://www.medienkulturzentrum.de/angebote/'

# Media types advertised for ATTACH, keyed by lower-case file extension.
IMAGE_MIME_TYPES = {
  '.jpg' => 'image/jpeg',
  '.jpeg' => 'image/jpeg',
  '.png' => 'image/png',
  '.gif' => 'image/gif',
  '.webp' => 'image/webp'
}.freeze

# Calendar template, rendered with ERB trim mode "-" so that the control tags
# (<%- ... -%>) do not leave blank lines in the output. Expects the locals
# `events` and `stamp` in the rendering binding.
ICAL_TEMPLATE = <<~ICAL
  BEGIN:VCALENDAR
  VERSION:2.0
  PRODID:-//mkz-programm//scrape.rb//DE
  METHOD:PUBLISH
  X-WR-TIMEZONE;VALUE=TEXT:Europe/Berlin
  <%- events.each do |ev| -%>
  BEGIN:VEVENT
  CLASS:PUBLIC
  UID:<%= ical_escape(ev.url) %>
  DTSTAMP:<%= stamp %>
  DTSTART:<%= fmt_time(ev.start) %>
  DTEND:<%= fmt_time(ev.end) %>
  SUMMARY:<%= ical_escape(ev.title) %>
  LOCATION:<%= ical_escape(ev.location) %>
  URL:<%= ev.url %>
  <%- if ev.image -%>
  ATTACH<%= attach_params(ev.image) %>:<%= ev.image %>
  <%- end -%>
  END:VEVENT
  <%- end -%>
  END:VCALENDAR
ICAL

# Downloads +url+ and parses it as HTML.
#
# @param url [String, URI] page to fetch
# @return [Nokogiri::HTML::Document]
def fetch_doc(url)
  $stderr.puts "GET #{url}"
  # Block form closes the underlying IO/Tempfile as soon as parsing is done.
  URI.open(url) { |io| Nokogiri::HTML(io) }
end

# Formats a time as an iCalendar local ("floating") DATE-TIME value.
#
# @param t [Time]
# @return [String] e.g. "20211013T174617"
def fmt_time(t)
  t.strftime '%Y%m%dT%H%M%S'
end

# Escapes a value for use as an iCalendar TEXT property value: backslash,
# semicolon and comma get a leading backslash, line breaks become a literal
# backslash followed by "n".
#
# @param value [#to_s]
# @return [String]
def ical_escape(value)
  # Block forms are used so the backslashes are not interpreted by gsub.
  value.to_s
       .gsub(/[\\;,]/) { |char| "\\#{char}" }
       .gsub(/\r?\n/) { '\n' }
end

# Builds the parameter part of an ATTACH property for an image URL.
#
# @param image [#to_s] image URL
# @return [String] ";FMTTYPE=<type>" for a known extension, otherwise ""
def attach_params(image)
  path = image.to_s.sub(/[?#].*\z/, '') # ignore query string and fragment
  mime = IMAGE_MIME_TYPES[File.extname(path).downcase]
  mime ? ";FMTTYPE=#{mime}" : ''
end

# Parses a German-style date range. The formats are tried in this order:
#
#   "DD.MM.YYYY, HH:MM - DD.MM.YYYY, HH:MM"
#   "DD.MM.YYYY, HH:MM - HH:MM"              (same day)
#   "DD.MM.YYYY - DD.MM.YYYY"                (00:00:00 to 23:59:59)
#
# @param text [String] text containing the range
# @return [Array(Time, Time), nil] start and end in local time, or nil when
#   no format matches or the numbers do not form a valid date
def parse_date_range(text)
  case text
  when /(\d+)\.(\d+)\.(\d+),\s*(\d+):(\d+)\s*-\s*(\d+)\.(\d+)\.(\d+),\s*(\d+):(\d+)/
    d1, m1, y1, h1, n1, d2, m2, y2, h2, n2 = Regexp.last_match.captures.map(&:to_i)
    [Time.local(y1, m1, d1, h1, n1, 0), Time.local(y2, m2, d2, h2, n2, 0)]
  when /(\d+)\.(\d+)\.(\d+),\s*(\d+):(\d+)\s*-\s*(\d+):(\d+)/
    d, m, y, h1, n1, h2, n2 = Regexp.last_match.captures.map(&:to_i)
    [Time.local(y, m, d, h1, n1, 0), Time.local(y, m, d, h2, n2, 0)]
  when /(\d+)\.(\d+)\.(\d+)\s*-\s*(\d+)\.(\d+)\.(\d+)/
    d1, m1, y1, d2, m2, y2 = Regexp.last_match.captures.map(&:to_i)
    [Time.local(y1, m1, d1, 0, 0, 0), Time.local(y2, m2, d2, 23, 59, 59)]
  end
rescue ArgumentError
  # Time.local rejects out-of-range components (e.g. month 13); the caller
  # reports such text as unrecognized instead of aborting the whole scrape.
  nil
end

# One scraped event; plain value holder filled in by scrape_event.
class Event
  attr_accessor :title, :start, :end, :location, :url, :image
end

# Resolves the article image of an event page to an absolute URL.
#
# @param doc [Nokogiri::HTML::Document] event detail page
# @param base_url [String, URI] URL of that page, used to resolve relative paths
# @return [String, nil] image URL, or nil when the page has no usable image
def image_url(doc, base_url)
  img = doc.at_css('section.page-document article img[1]')
  src = img && img['src']
  return nil if src.nil? || src.strip.empty?

  src = src.strip
  begin
    URI.join(base_url, src).to_s
  rescue URI::InvalidURIError
    src # keep the raw attribute rather than dropping the image
  end
end

# Fetches one event detail page and extracts its data.
#
# @param url [URI] event detail page
# @return [Event, nil] nil when the page has no date element or the date
#   text is not understood (the latter is reported on stderr)
def scrape_event(url)
  doc = fetch_doc(url)

  date_node = doc.at_css('section.page-document header strong')
  return nil unless date_node

  date_text = date_node.parent.text
  span = parse_date_range(date_text)
  unless span
    # stderr, not stdout: stdout carries the calendar itself.
    $stderr.puts "Unrecognized date: #{date_text}"
    return nil
  end

  ev = Event.new
  ev.url = url
  ev.start, ev.end = span
  ev.title = doc.css('section.page-document header h1').text.strip
  # The address block is multi-line; collapse it to one comma-separated line.
  ev.location = doc.css('section.page-document address').text
                   .lines
                   .map(&:strip)
                   .reject(&:empty?)
                   .join(', ')
  ev.image = image_url(doc, url)
  ev
end

# Walks the listing starting at +start_url+, following the pagination link
# matched by ".wp-pagenavi .current + .page" until there is none, and scrapes
# every event linked with the text "weiterlesen".
#
# @param start_url [String, URI]
# @return [Array<Event>]
def scrape_events(start_url)
  events = []
  url = start_url

  while url
    list_doc = fetch_doc(url)

    list_doc.css("a[text()='weiterlesen']").each do |link|
      href = link['href']
      next unless href

      ev = scrape_event(URI.join(url, href))
      events << ev if ev
    end

    next_page = list_doc.at_css('.wp-pagenavi .current + .page')
    next_href = next_page && next_page['href']
    url = next_href && URI.join(url, next_href)
  end

  events
end

# Renders the events as an iCalendar document.
#
# @param events [Array<Event>]
# @return [String] calendar text with CRLF line endings
def render_ical(events)
  # `stamp` is read by ICAL_TEMPLATE through the binding (DTSTAMP, in UTC).
  stamp = Time.now.utc.strftime('%Y%m%dT%H%M%SZ')
  ERB.new(ICAL_TEMPLATE, trim_mode: '-')
     .result(binding)
     .gsub(/\r?\n/, "\r\n")
end

$stdout.write render_ical(scrape_events(LISTING_URL))