From 7c8c3db396ee7310a5f7191010b54a5e4a070607 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Johannes=20L=C3=B6tzsch?=
Date: Sat, 5 Mar 2022 16:27:19 +0100
Subject: [PATCH] Parse mbox compatible to clojure-mail

---
Review notes (this section is ignored by `git am`):
- mbox-parser: keep the last line of an mbox that lacks the trailing blank line
- mbox->emls: close the reader (with-open) and realize the messages eagerly
- eml->message: encode with an explicit UTF-8 instead of the platform charset
- message->html: match the text/html media type regardless of case/parameters
- -main: realize the result and print it; a discarded lazy seq did nothing
- The blob ids on the `index` lines are those of the original submission.

 import/api/wpforms-mails/project.clj          | 12 ++++
 .../wpforms-mails/src/mbox_parser/core.clj    | 49 +++++++++++++
 .../wpforms-mails/src/wpforms_mails/core.clj  | 68 +++++++++++++++++++
 3 files changed, 129 insertions(+)
 create mode 100644 import/api/wpforms-mails/project.clj
 create mode 100644 import/api/wpforms-mails/src/mbox_parser/core.clj
 create mode 100644 import/api/wpforms-mails/src/wpforms_mails/core.clj

diff --git a/import/api/wpforms-mails/project.clj b/import/api/wpforms-mails/project.clj
new file mode 100644
index 0000000..b7bff55
--- /dev/null
+++ b/import/api/wpforms-mails/project.clj
@@ -0,0 +1,12 @@
+(defproject wpforms-mails "0.1.0-SNAPSHOT"
+  :description "Parse mails with datasets from WPForms"
+  :url "https://mission-lifeline.de/unterkunft-bereitstellen"
+  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
+            :url "https://www.eclipse.org/legal/epl-2.0/"}
+  :dependencies [[org.clojure/clojure "1.10.3"]
+                 [yogthos/config "1.2.0"]
+                 [io.forward/clojure-mail "1.0.8"]]
+  :main ^:skip-aot wpforms-mails.core
+  :target-path "target/%s"
+  :profiles {:uberjar {:aot :all
+                       :jvm-opts ["-Dclojure.compiler.direct-linking=true"]}})
diff --git a/import/api/wpforms-mails/src/mbox_parser/core.clj b/import/api/wpforms-mails/src/mbox_parser/core.clj
new file mode 100644
index 0000000..196ffd8
--- /dev/null
+++ b/import/api/wpforms-mails/src/mbox_parser/core.clj
@@ -0,0 +1,49 @@
+(ns mbox-parser.core
+  "Minimal mbox splitter, vendored from
+  https://github.com/diavoletto76/mbox-parser/blob/master/src/mbox_parser/core.clj
+
+  Local change: `parse-lines` keeps the final line of an mbox that does not
+  end with a blank line (upstream silently drops it).")
+
+(defn- mbox-separator?
+  "As per RFC 4155 (https://tools.ietf.org/html/rfc4155) an mbox separator is
+  an empty line followed by a line starting with \"From \".
+
+  Returns true when `line1` is empty and `line2` starts with \"From \".
+  `line2` must be a string because `re-matches` does not accept nil."
+  ;; TODO Enhance regexp to better match line2
+  [line1 line2]
+  (and (empty? line1)
+       (boolean (re-matches #"^From .*" line2))))
+
+
+(defn parse-lines
+  "Given a seq of lines (e.g. from `line-seq`) returns a lazy seq of messages.
+  Messages are represented as a lazy sequence of lines, without the leading
+  \"From \" line and without the blank line that terminates the message.
+
+  Every line is paired with its successor so that a separator can be
+  detected with one line of lookahead. The final line has no successor and
+  is paired with nil; it is kept unless it is blank, so an mbox without the
+  trailing blank line does not lose its last line."
+  [lines]
+  (let [;; virtual blank line so that a leading "From " counts as a separator
+        fixed-lines (cons "" lines)
+        separator?  (fn [[a b]] (and (some? b) (mbox-separator? a b)))]
+    (->> (map list fixed-lines (concat (rest fixed-lines) [nil]))
+         ;; [last-line nil]: drop it only when the last line is blank
+         (remove (fn [[a b]] (and (nil? b) (empty? a))))
+         (partition-by separator?)
+         (remove (comp separator? first))
+         (map #(map first %))
+         ;; drop the "From " line (or the virtual blank line) heading each message
+         (map rest))))
+
+
+(defn parse-reader
+  "Given the BufferedReader of an mbox returns a lazy seq of the messages
+  contained in it. Messages are represented as lazy sequences of lines, so
+  the result must be consumed before `reader` is closed."
+  [reader]
+  (->> (line-seq reader)
+       (parse-lines)))
diff --git a/import/api/wpforms-mails/src/wpforms_mails/core.clj b/import/api/wpforms-mails/src/wpforms_mails/core.clj
new file mode 100644
index 0000000..cb47172
--- /dev/null
+++ b/import/api/wpforms-mails/src/wpforms_mails/core.clj
@@ -0,0 +1,68 @@
+(ns wpforms-mails.core
+  (:require [config.core :refer [env]]
+            [clojure.java.io :as io]
+            [mbox-parser.core :as mbox]
+            [clojure.string :refer [join]]
+            [clojure-mail.message :as cmm])
+  (:import [java.util Properties]
+           [javax.mail Session]
+           [javax.mail.internet MimeMessage])
+  (:gen-class))
+
+(defn mbox->emls
+  "Split an .mbox file (multiple mails) into a vector of mails, each one a
+  single string with its lines joined by newlines.
+
+  The file is read eagerly: the messages are built on a lazy `line-seq`, so
+  they have to be realized before `with-open` closes the reader."
+  [filename]
+  (with-open [rdr (io/reader filename)]
+    (->> (mbox/parse-reader rdr)
+         (mapv #(join "\n" %)))))
+
+(defn eml->message
+  "Convert an eml string into a MimeMessage.
+
+  The string is encoded as UTF-8, the encoding `io/reader` uses by default
+  when the mbox is read, rather than the platform default charset."
+  [eml]
+  (let [session (Session/getDefaultInstance (Properties.))
+        is      (java.io.ByteArrayInputStream. (.getBytes ^String eml "UTF-8"))]
+    (MimeMessage. session is)))
+
+(defn file->messages
+  "A substitute for [(cmc/file->message filename)] that can handle files
+  containing multiple mails (mbox). Returns a lazy seq of MimeMessages."
+  [filename]
+  (map eml->message (mbox->emls filename)))
+
+(defn message->html
+  "Return the HTML body of a MimeMessage, or nil when its content type is
+  not text/html.
+
+  Only the media type is compared, case-insensitively, so parameters such
+  as `charset=UTF-8` in any spelling do not prevent a match."
+  [message]
+  (let [msg:edn (cmm/read-message message)]
+    (when (some->> (:content-type msg:edn)
+                   (re-find #"(?i)^\s*text/html\s*(;|$)"))
+      (-> msg:edn :body :body))))
+
+(defn -main
+  "Parse every mail of the mbox file configured as :wpforms-mails-file and
+  print the HTML bodies that were found. Returns a vector with one entry per
+  mail: the HTML body, or nil for mails that are not text/html."
+  [& _args]
+  ;; mapv instead of map: a lazy seq would never be realized when run as main
+  (let [htmls (mapv message->html
+                    (file->messages (:wpforms-mails-file env)))]
+    (run! println (remove nil? htmls))
+    htmls))
+
+(comment
+  (require '[clojure-mail.core :as cmc])
+  (count (mbox->emls (:wpforms-mails-file env)))
+  (count (file->messages (:wpforms-mails-file env)))
+  (message->html (cmc/file->message "/tmp/example"))
+  (message->html (second (file->messages (:wpforms-mails-file env))))
+  (-main))