Parse mbox files in a way that is compatible with clojure-mail
This commit is contained in:
parent
52cd3b6ec3
commit
7c8c3db396
|
@ -0,0 +1,12 @@
|
||||||
|
(defproject wpforms-mails "0.1.0-SNAPSHOT"
  ;; Project metadata.
  :description "Parse mails with datasets from WPForms"
  :url "https://mission-lifeline.de/unterkunft-bereitstellen"
  :license {:name "EPL-2.0 OR GPL-2.0-or-later WITH Classpath-exception-2.0"
            :url "https://www.eclipse.org/legal/epl-2.0/"}

  ;; Libraries needed at runtime.
  :dependencies [[org.clojure/clojure "1.10.3"]
                 [yogthos/config "1.2.0"]
                 [io.forward/clojure-mail "1.0.8"]]

  ;; Namespace holding -main; ^:skip-aot keeps it from being AOT-compiled
  ;; outside of the uberjar profile below.
  :main ^:skip-aot wpforms-mails.core

  ;; %s is replaced by Leiningen with the active profile name(s).
  :target-path "target/%s"

  ;; The standalone jar is fully AOT-compiled with direct linking enabled.
  :profiles {:uberjar {:aot :all
                       :jvm-opts ["-Dclojure.compiler.direct-linking=true"]}})
|
|
@ -0,0 +1,31 @@
|
||||||
|
(ns mbox-parser.core
|
||||||
|
"https://github.com/diavoletto76/mbox-parser/blob/master/src/mbox_parser/core.clj")
|
||||||
|
|
||||||
|
(defn- mbox-separator?
  "Returns true when the two consecutive lines `line1` and `line2` form an
  mbox message separator, false otherwise.

  Following RFC 4155 (https://tools.ietf.org/html/rfc4155), a separator is an
  empty line followed by a line of the form \"From <email> <timestamp>\".
  Only the leading \"From \" of `line2` is actually checked."
  ;; TODO Enhance regexp to better match line2
  [line1 line2]
  (if (seq line1)
    ;; A non-empty first line can never introduce a separator.
    false
    (some? (re-matches #"^From .*" line2))))
|
||||||
|
|
||||||
|
|
||||||
|
(defn parse-lines
  "Given a seq of lines (e.g. from `line-seq`) returns a lazy seq of
  messages. Each message is a lazy seq of its lines, without the leading
  \"From ...\" separator line and without the empty line that precedes the
  next separator.

  How it works: every line is paired with its successor so that
  `mbox-separator?` can look at two consecutive lines at once. Runs of
  non-separator pairs make up one message each."
  [lines]
  (let [;; Pretend there is an empty line before the input so that the very
        ;; first \"From ...\" line is recognised as a separator too.
        fixed-lines (cons "" lines)
        ;; Successor of every line. The trailing sentinel gives the LAST input
        ;; line a successor as well; without it that line never appears as the
        ;; first element of a pair and is silently dropped, truncating the
        ;; final message of any mbox that does not end with an empty line.
        ;; The sentinel looks like a separator line, so a terminating empty
        ;; line is still swallowed exactly as it is between two messages.
        successors  (concat (rest fixed-lines) ["From "])]
    (->> (map list fixed-lines successors)
         (partition-by (fn [[a b]] (mbox-separator? a b)))
         ;; Drop the groups that consist of separator pairs.
         (remove (fn [[[a b]]] (mbox-separator? a b)))
         ;; Back from pairs to plain lines.
         (map #(map first %))
         ;; The first line of every group is the \"From ...\" line itself.
         (map rest))))
|
||||||
|
|
||||||
|
|
||||||
|
(defn parse-reader
  "Splits the mbox content available from `reader` (a BufferedReader) into
  messages. Returns a lazy seq with one entry per message; every message is
  itself a lazy seq of lines, as produced by `parse-lines`."
  [reader]
  (parse-lines (line-seq reader)))
|
|
@ -0,0 +1,48 @@
|
||||||
|
(ns wpforms-mails.core
|
||||||
|
(:require [config.core :refer [env]]
|
||||||
|
[clojure.java.io :as io]
|
||||||
|
[mbox-parser.core :as mbox]
|
||||||
|
[clojure.string :refer [join]]
|
||||||
|
[clojure-mail.message :as cmm])
|
||||||
|
(:import [java.util Properties]
|
||||||
|
[javax.mail Session]
|
||||||
|
[javax.mail.internet MimeMessage])
|
||||||
|
(:gen-class))
|
||||||
|
|
||||||
|
(defn mbox->emls
  "Splits an .mbox file (multiple mails) into a sequence of mails.

  `filename` is anything `clojure.java.io/reader` accepts. Returns a fully
  realised seq of strings, one per mail, with the lines of each mail joined
  by \"\\n\".

  The file is read eagerly inside `with-open` so that the reader is closed
  before this function returns; a lazy result would either leak the open
  file or fail once the reader has been closed."
  [filename]
  (with-open [rdr (io/reader filename)]
    (->> (mbox/parse-reader rdr)
         (map #(join "\n" %))
         ;; Realise everything while the reader is still open.
         (doall))))
|
||||||
|
|
||||||
|
(defn eml->message
  "Converts an eml string (one complete RFC 822 style mail) into a
  `javax.mail.internet.MimeMessage`.

  The string is encoded as UTF-8 explicitly instead of with the platform
  default charset, so the bytes handed to the MIME parser do not depend on
  the machine the program runs on. `clojure.java.io/reader`, which
  `mbox->emls` uses to read the file, decodes as UTF-8 by default as well."
  [eml]
  (let [session (Session/getDefaultInstance (Properties.))
        ;; The ^String hint avoids a reflective .getBytes call.
        in      (java.io.ByteArrayInputStream.
                 (.getBytes ^String eml java.nio.charset.StandardCharsets/UTF_8))]
    (MimeMessage. session in)))
|
||||||
|
|
||||||
|
(defn file->messages
  "Reads every mail contained in the mbox file `filename` and returns them as
  a seq of MimeMessages. Meant as a replacement for
  [(cmc/file->message filename)] for files that hold more than one mail."
  [filename]
  (->> filename
       mbox->emls
       (map eml->message)))
|
||||||
|
|
||||||
|
(defn message->html
  "Extracts the html body of a MimeMessage.

  The message is converted with clojure-mail's `read-message`. Returns the
  nested `:body` of the message body when the content type is exactly
  \"text/html; charset=utf-8\", and nil for every other content type."
  [message]
  (let [parsed (cmm/read-message message)
        html?  (= (:content-type parsed) "text/html; charset=utf-8")]
    (when html?
      (get-in parsed [:body :body]))))
|
||||||
|
|
||||||
|
(defn -main
  "Entry point. Reads the mbox file configured under `:wpforms-mails-file`
  in `env` and returns a seq with the result of `message->html` for every
  mail in it. Command line arguments are ignored.

  The seq is forced with `doall`: `map` alone is lazy, and when the program
  is started from the command line nobody consumes the value returned by
  `-main`, so without forcing it no mail would be read or parsed at all."
  [& _args]
  (doall
   (map message->html
        (file->messages (:wpforms-mails-file env)))))
|
||||||
|
|
||||||
|
(comment
  ;; REPL scratchpad: nothing in this form runs when the namespace is loaded.

  ;; `cmc` is not aliased by the ns form of this file; evaluate this first so
  ;; that the `cmc/file->message` expression below can be resolved.
  (require '[clojure-mail.core :as cmc])

  (count (mbox->emls (:wpforms-mails-file env)))
  (count (file->messages (:wpforms-mails-file env)))
  (message->html (cmc/file->message "/tmp/example"))
  (message->html (second (file->messages (:wpforms-mails-file env))))
  (-main))
|
Loading…
Reference in New Issue