From 1e247b47687d27b6b1b262e00900400cf9b28809 Mon Sep 17 00:00:00 2001 From: Astro Date: Fri, 24 Feb 2023 19:39:30 +0100 Subject: [PATCH] butcher: implement reloadable profanity list --- butcher/config.yaml | 1 + butcher/src/config.rs | 1 + butcher/src/main.rs | 30 ++++++++++-- cave/src/lib.rs | 1 + cave/src/store.rs | 103 +----------------------------------------- cave/src/word_list.rs | 39 ++++++++++++++++ nixos-module.nix | 6 +++ profanity.txt | 99 ++++++++++++++++++++++++++++++++++++++++ 8 files changed, 176 insertions(+), 104 deletions(-) create mode 100644 cave/src/word_list.rs create mode 100644 profanity.txt diff --git a/butcher/config.yaml b/butcher/config.yaml index bdc9129..0625f39 100644 --- a/butcher/config.yaml +++ b/butcher/config.yaml @@ -1,2 +1,3 @@ #redis: redis://10.233.12.2:6379/ redis: redis://127.0.0.1:6378/ +profanity: ../profanity.txt diff --git a/butcher/src/config.rs b/butcher/src/config.rs index 5e4e419..07723de 100644 --- a/butcher/src/config.rs +++ b/butcher/src/config.rs @@ -1,4 +1,5 @@ #[derive(Debug, serde::Deserialize)] pub struct Config { pub redis: String, + pub profanity: String, } diff --git a/butcher/src/main.rs b/butcher/src/main.rs index cdb36c0..44b7306 100644 --- a/butcher/src/main.rs +++ b/butcher/src/main.rs @@ -1,7 +1,13 @@ +use std::{ + sync::Arc, + ops::Deref, +}; use futures::StreamExt; use cave::{ config::LoadConfig, + feed::Post, firehose::FirehoseFactory, + word_list::WordList, }; use trend_setter::UpdateSet; @@ -9,12 +15,28 @@ mod config; mod trend_setter; mod tag_trimmer; +async fn is_profane(profanity: &WordList, post: &Post) -> bool { + if post.sensitive == Some(true) { + return true; + } + + let tags_set = post.tags_set(); + let tagged_profanity = + futures::stream::iter( + tags_set.iter() + ) + .any(|(tag, _spellings)| profanity.contains(tag)); + + tagged_profanity.await +} + #[tokio::main] async fn main() { cave::init::exit_on_panic(); cave::init::init_logger(5555); let config = config::Config::load(); + let profanity = WordList::new(&config.profanity).await; let store = cave::store::Store::new(16, config.redis.clone()).await; @@ -34,6 +56,7 @@ async fn main() { firehose.for_each(move |data| { let trend_setter_tx = trend_setter_tx.clone(); let mut store = store.clone(); + let profanity = profanity.clone(); tokio::spawn(async move { let post = match serde_json::from_slice(&data) { Ok(post) => @@ -43,9 +66,10 @@ async fn main() { return; }, }; - store.save_post_tags(&post).await; - - let update_set = UpdateSet::from(&post); + let post = Arc::new(post); + store.save_post_tags(&post, is_profane(&profanity, &post).await).await; + + let update_set = UpdateSet::from(post.deref()); if ! update_set.is_empty() { trend_setter_tx.send(update_set).await.unwrap(); } diff --git a/cave/src/lib.rs b/cave/src/lib.rs index 2cdf5a5..f5c45a8 100644 --- a/cave/src/lib.rs +++ b/cave/src/lib.rs @@ -6,6 +6,7 @@ pub mod store; pub mod trend_tag; pub mod firehose; pub mod live_file; +pub mod word_list; pub const PERIODS: &[u64] = &[4, 24, 7 * 24]; diff --git a/cave/src/store.rs b/cave/src/store.rs index d14c7ea..f96857c 100644 --- a/cave/src/store.rs +++ b/cave/src/store.rs @@ -18,100 +18,6 @@ const HOST_EXPIRE: usize = 30 * 86400; pub const TREND_POOL_SIZE: usize = 20; pub const IMAGES_PER_TAG: usize = 8; -pub const UNSAFE_TAGS: &[&str] = &[ - "bigdick", - "gayporn", - "porn", - "p0rn", - "pr0n", - "lolita", - "lolitas", - "boob", - "b00b", - "tit", - "tits", - "breast", - "breasts", - "fuck", - "fucked", - "fucking", - "sex", - "sexy", - "anal", - "adult", - "penis", - "dick", - "cock", - "c0ck", - "lewd", - "hentai", - "transselfie", - "femdom", - "kink", - "kinky", - "erotic", - "erotica", - "nude", - "nudism", - "nudist", - "nakt", - "naked", - "exhibitionism", - "flashing", - "piss", - "pee", - "poop", - "shit", - "dogshit", - "gore", - "nsfw", - "nsfwart", - "pussy", - "pussies", - "vagina", - "ass", - "asses", - "arsch", - "ärsche", - "heinie", - "butt", - "butts", - "bukkake", - "cumshot", - "domsub", - "cw", - "bigpenis", - "pokephilia", - "pokeporn", - "tentacle", - "yiff", - "semen", - "rule34", - "r34", - "yaoi", - "swastika", - "hardcore", - "shota", - "dildo", - "nutte", - "nutten", - "whore", - "whores", - "hoe", - "hoes", - "prostitute", - "prostitutes", - "prostitution", - "adultcartoon", - "adultcartoons", - "cartoonporn", - "bigtit", - "bigtits", - "bigboobs", - "blowjob", - "topless", -]; - pub type Error = RedisError; /// wrapper so we can impl ManageConnection @@ -237,7 +143,7 @@ impl Store { Ok(true) } - pub async fn save_post_tags(&mut self, post: &Post) { + pub async fn save_post_tags(&mut self, post: &Post, tagged_unsafe: bool) { if post.account.bot || post.tags.is_empty() { // irrelevant return; @@ -297,11 +203,6 @@ impl Store { .ignore(); } }; - let tags_set = post.tags_set(); - let tagged_unsafe = post.sensitive != Some(false) || - UNSAFE_TAGS.iter().any(|unsafe_tag| - tags_set.contains_key(&unsafe_tag[..]) - ); let images = if !tagged_unsafe { post.media_attachments.iter() .filter(|a| a.media_type == "image") @@ -315,7 +216,7 @@ impl Store { vec![] }; let mut image_keys = vec![]; - for (name, spellings) in tags_set { + for (name, spellings) in post.tags_set() { // global store_tags(&mut cmd, spellings.clone(), diff --git a/cave/src/word_list.rs b/cave/src/word_list.rs new file mode 100644 index 0000000..465e879 --- /dev/null +++ b/cave/src/word_list.rs @@ -0,0 +1,39 @@ +use std::{sync::Arc, collections::HashSet}; + +use tokio::{ + io::{BufReader, AsyncBufReadExt}, + sync::RwLock, +}; + +#[derive(Clone)] +pub struct WordList { + list: Arc>>, +} + +impl WordList { + pub async fn new(path: &str) -> WordList { + let list = crate::live_file::load(path, |file| async move { + let mut list = HashSet::new(); + let mut file = BufReader::new(file); + let mut line = String::new(); + while let Ok(_) = file.read_line(&mut line).await { + if line == "" { + break + } + + list.insert(line.trim_end().to_string()); + + line = String::new(); + } + list + }).await.unwrap(); + + + WordList { list } + } + + pub async fn contains(&self, word: &str) -> bool { + self.list.read().await + .contains(word) + } +} diff --git a/nixos-module.nix b/nixos-module.nix index 93aef4b..e816d29 100644 --- a/nixos-module.nix +++ b/nixos-module.nix @@ -4,6 +4,7 @@ let cfg = config.services.caveman; blocklistPath = "/etc/caveman.blocklist"; + profanityPath = "/etc/caveman.profanity"; hunterDefaultSettings = { redis = "redis://127.0.0.1:${toString cfg.redis.port}/"; @@ -21,6 +22,7 @@ let butcherDefaultSettings = { redis = "redis://127.0.0.1:${toString cfg.redis.port}/"; + profanity = profanityPath; }; butcherSettings = lib.recursiveUpdate butcherDefaultSettings cfg.butcher.settings; @@ -127,6 +129,10 @@ in hunterSettings.prometheus_port ]; + systemd.tmpfiles.rules = [ + "L ${profanityPath} - - - - ${./profanity.txt}" + ]; + services.redis.servers.caveman = lib.mkIf cfg.hunter.enable { enable = true; port = cfg.redis.port; diff --git a/profanity.txt b/profanity.txt new file mode 100644 index 0000000..4b14ff1 --- /dev/null +++ b/profanity.txt @@ -0,0 +1,99 @@ +bigdick +gayporn +porn +p0rn +pr0n +lolita +lolitas +loli +lolicon +boob +b00b +tit +tits +breast +breasts +fuck +fucked +fucking +sex +sexy +anal +adult +penis +dick +cock +c0ck +lewd +hentai +transselfie +femdom +kink +kinky +erotic +erotica +nude +nudism +nudist +nakt +naked +exhibitionism +flashing +piss +pee +poop +shit +dogshit +gore +nsfw +nsfwart +pussy +pussies +vagina +ass +asses +arsch +ärsche +heinie +butt +butts +bukkake +cumshot +domsub +cw +bigpenis +pokephilia +pokeporn +tentacle +yiff +semen +rule34 +r34 +yaoi +swastika +hardcore +shota +dildo +nutte +nutten +whore +whores +hoe +hoes +prostitute +prostitutes +prostitution +adultcartoon +adultcartoons +cartoonporn +bigtit +bigtits +bigboobs +blowjob +topless +masturbate +masturbation +shemale +shemales +beautifulgirls +stripchat