From d8d9feac7a6b39dbc2273b391e046fb85aa29d6d Mon Sep 17 00:00:00 2001 From: Astro Date: Sun, 9 Jul 2006 22:37:38 +0000 Subject: [PATCH] Proof of concept --- Filmnächte-Scraping.mw | 50 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 Filmnächte-Scraping.mw diff --git a/Filmnächte-Scraping.mw b/Filmnächte-Scraping.mw new file mode 100644 index 00000000..8d4947c4 --- /dev/null +++ b/Filmnächte-Scraping.mw @@ -0,0 +1,50 @@ +Bald sind wieder, ganz hipp, '''Filmnächte am Elbufer'''. Leider ist auf http://filmnaechte-am-elbufer.de/ keine sofort verwertbare Information zu finden. + +
require 'htree'
+
+#
+# Download the programme page (the wget call below is commented out)
+#
+
+#system("wget -O programm.html 'http://filmnaechte-am-elbufer.de/fn.php?idx=20'")
+
+
+#
+# Parsen
+#
+
+doc = HTree(File.new('programm.html')).to_rexml
+events = []
+spans = {}
+doc.each_element('/html/body//table[@style=\'width: 488px\']/tr/td//span') { |span|
+  text = span.text.to_s
+  text.gsub!(/\ /, ' ')
+
+  spans[span.attributes['class']] = text if text.size > 0
+
+  if span.attributes['class'] == 'progTitle'
+    events << spans
+    spans = {'progDay'=>spans['progDay'],
+             'progTime'=>spans['progTime'],
+             'progWeek'=>spans['progWeek']}
+  end
+}
+
+
+#
+# Ausgabe tabellarisch
+#
+
+column_sizes = Hash.new(0)
+events.each { |event|
+  event.each { |column,cell|
+    column_sizes[column] = cell.size if cell.size > column_sizes[column]
+  }
+}
+
+events.each { |event|
+  ptsd = event['progTitleSpecialDay']
+  puts %w(progWeek progDay progTime progTitle).collect { |column|
+    event[column].ljust(column_sizes[column] + 2)
+  }.to_s.strip + (ptsd ? " (#{ptsd.strip})" : "")
+}