butcher: implement reloadable profanity list

This commit is contained in:
Astro 2023-02-24 19:39:30 +01:00
parent 2776e007c3
commit 1e247b4768
8 changed files with 176 additions and 104 deletions

View File

@ -1,2 +1,3 @@
#redis: redis://10.233.12.2:6379/
redis: redis://127.0.0.1:6378/
profanity: ../profanity.txt

View File

@ -1,4 +1,5 @@
/// Settings of this service, deserialized with serde from its configuration file.
#[derive(Debug, serde::Deserialize)]
pub struct Config {
/// Redis connection URL, e.g. `redis://127.0.0.1:6378/`.
pub redis: String,
/// Path of the profanity word list file, e.g. `../profanity.txt`.
pub profanity: String,
}

View File

@ -1,7 +1,13 @@
use std::{
sync::Arc,
ops::Deref,
};
use futures::StreamExt;
use cave::{
config::LoadConfig,
feed::Post,
firehose::FirehoseFactory,
word_list::WordList,
};
use trend_setter::UpdateSet;
@ -9,12 +15,28 @@ mod config;
mod trend_setter;
mod tag_trimmer;
async fn is_profane(profanity: &WordList, post: &Post) -> bool {
if post.sensitive == Some(true) {
return true;
}
let tags_set = post.tags_set();
let tagged_profanity =
futures::stream::iter(
tags_set.iter()
)
.any(|(tag, _spellings)| profanity.contains(tag));
tagged_profanity.await
}
#[tokio::main]
async fn main() {
cave::init::exit_on_panic();
cave::init::init_logger(5555);
let config = config::Config::load();
let profanity = WordList::new(&config.profanity).await;
let store = cave::store::Store::new(16, config.redis.clone()).await;
@ -34,6 +56,7 @@ async fn main() {
firehose.for_each(move |data| {
let trend_setter_tx = trend_setter_tx.clone();
let mut store = store.clone();
let profanity = profanity.clone();
tokio::spawn(async move {
let post = match serde_json::from_slice(&data) {
Ok(post) =>
@ -43,9 +66,10 @@ async fn main() {
return;
},
};
store.save_post_tags(&post).await;
let update_set = UpdateSet::from(&post);
let post = Arc::new(post);
store.save_post_tags(&post, is_profane(&profanity, &post).await).await;
let update_set = UpdateSet::from(post.deref());
if ! update_set.is_empty() {
trend_setter_tx.send(update_set).await.unwrap();
}

View File

@ -6,6 +6,7 @@ pub mod store;
pub mod trend_tag;
pub mod firehose;
pub mod live_file;
pub mod word_list;
pub const PERIODS: &[u64] = &[4, 24, 7 * 24];

View File

@ -18,100 +18,6 @@ const HOST_EXPIRE: usize = 30 * 86400;
pub const TREND_POOL_SIZE: usize = 20;
pub const IMAGES_PER_TAG: usize = 8;
/// Tag names that cause a post to be treated as unsafe.
///
/// `Store::save_post_tags` looks every entry up in the tag set of a post;
/// a single match is enough to set its `tagged_unsafe` flag.
pub const UNSAFE_TAGS: &[&str] = &[
"bigdick",
"gayporn",
"porn",
"p0rn",
"pr0n",
"lolita",
"lolitas",
"boob",
"b00b",
"tit",
"tits",
"breast",
"breasts",
"fuck",
"fucked",
"fucking",
"sex",
"sexy",
"anal",
"adult",
"penis",
"dick",
"cock",
"c0ck",
"lewd",
"hentai",
"transselfie",
"femdom",
"kink",
"kinky",
"erotic",
"erotica",
"nude",
"nudism",
"nudist",
"nakt",
"naked",
"exhibitionism",
"flashing",
"piss",
"pee",
"poop",
"shit",
"dogshit",
"gore",
"nsfw",
"nsfwart",
"pussy",
"pussies",
"vagina",
"ass",
"asses",
"arsch",
"ärsche",
"heinie",
"butt",
"butts",
"bukkake",
"cumshot",
"domsub",
"cw",
"bigpenis",
"pokephilia",
"pokeporn",
"tentacle",
"yiff",
"semen",
"rule34",
"r34",
"yaoi",
"swastika",
"hardcore",
"shota",
"dildo",
"nutte",
"nutten",
"whore",
"whores",
"hoe",
"hoes",
"prostitute",
"prostitutes",
"prostitution",
"adultcartoon",
"adultcartoons",
"cartoonporn",
"bigtit",
"bigtits",
"bigboobs",
"blowjob",
"topless",
];
pub type Error = RedisError;
/// wrapper so we can impl ManageConnection
@ -237,7 +143,7 @@ impl Store {
Ok(true)
}
pub async fn save_post_tags(&mut self, post: &Post) {
pub async fn save_post_tags(&mut self, post: &Post, tagged_unsafe: bool) {
if post.account.bot || post.tags.is_empty() {
// irrelevant
return;
@ -297,11 +203,6 @@ impl Store {
.ignore();
}
};
let tags_set = post.tags_set();
let tagged_unsafe = post.sensitive != Some(false) ||
UNSAFE_TAGS.iter().any(|unsafe_tag|
tags_set.contains_key(&unsafe_tag[..])
);
let images = if !tagged_unsafe {
post.media_attachments.iter()
.filter(|a| a.media_type == "image")
@ -315,7 +216,7 @@ impl Store {
vec![]
};
let mut image_keys = vec![];
for (name, spellings) in tags_set {
for (name, spellings) in post.tags_set() {
// global
store_tags(&mut cmd,
spellings.clone(),

39
cave/src/word_list.rs Normal file
View File

@ -0,0 +1,39 @@
use std::{sync::Arc, collections::HashSet};
use tokio::{
io::{BufReader, AsyncBufReadExt},
sync::RwLock,
};
/// A set of words read from a text file, one word per line.
///
/// The set lives behind an `Arc<RwLock<..>>` obtained from
/// `crate::live_file::load`, so clones are cheap and all of them share the
/// same underlying set.
// NOTE(review): presumably `live_file` re-runs the parser when the file
// changes, which is what makes the list reloadable; confirm in that module.
#[derive(Clone)]
pub struct WordList {
    list: Arc<RwLock<HashSet<String>>>,
}

impl WordList {
    /// Loads the word list from the file at `path`.
    ///
    /// Every line contributes one word. Surrounding whitespace (including
    /// the line terminator) is stripped, and lines that are empty after
    /// stripping are skipped so the empty string never ends up in the set.
    /// A read error ends parsing; the words read up to that point are kept.
    ///
    /// # Panics
    ///
    /// Panics if `crate::live_file::load` fails.
    pub async fn new(path: &str) -> WordList {
        let list = crate::live_file::load(path, |file| async move {
            let mut words = HashSet::new();
            let mut reader = BufReader::new(file);
            // One buffer is reused for all lines instead of allocating per line.
            let mut line = String::new();
            loop {
                match reader.read_line(&mut line).await {
                    // `Ok(0)` signals end of file.
                    Ok(0) | Err(_) => break,
                    Ok(_) => {}
                }
                let word = line.trim();
                if !word.is_empty() {
                    words.insert(word.to_string());
                }
                line.clear();
            }
            words
        }).await.expect("failed to load word list");

        WordList { list }
    }

    /// Returns `true` if `word` is an exact member of the list.
    ///
    /// Takes the read lock for the duration of the lookup.
    pub async fn contains(&self, word: &str) -> bool {
        self.list.read().await
            .contains(word)
    }
}

View File

@ -4,6 +4,7 @@ let
cfg = config.services.caveman;
blocklistPath = "/etc/caveman.blocklist";
profanityPath = "/etc/caveman.profanity";
hunterDefaultSettings = {
redis = "redis://127.0.0.1:${toString cfg.redis.port}/";
@ -21,6 +22,7 @@ let
butcherDefaultSettings = {
redis = "redis://127.0.0.1:${toString cfg.redis.port}/";
profanity = profanityPath;
};
butcherSettings = lib.recursiveUpdate butcherDefaultSettings cfg.butcher.settings;
@ -127,6 +129,10 @@ in
hunterSettings.prometheus_port
];
systemd.tmpfiles.rules = [
"L ${profanityPath} - - - - ${./profanity.txt}"
];
services.redis.servers.caveman = lib.mkIf cfg.hunter.enable {
enable = true;
port = cfg.redis.port;

99
profanity.txt Normal file
View File

@ -0,0 +1,99 @@
bigdick
gayporn
porn
p0rn
pr0n
lolita
lolitas
loli
lolicon
boob
b00b
tit
tits
breast
breasts
fuck
fucked
fucking
sex
sexy
anal
adult
penis
dick
cock
c0ck
lewd
hentai
transselfie
femdom
kink
kinky
erotic
erotica
nude
nudism
nudist
nakt
naked
exhibitionism
flashing
piss
pee
poop
shit
dogshit
gore
nsfw
nsfwart
pussy
pussies
vagina
ass
asses
arsch
ärsche
heinie
butt
butts
bukkake
cumshot
domsub
cw
bigpenis
pokephilia
pokeporn
tentacle
yiff
semen
rule34
r34
yaoi
swastika
hardcore
shota
dildo
nutte
nutten
whore
whores
hoe
hoes
prostitute
prostitutes
prostitution
adultcartoon
adultcartoons
cartoonporn
bigtit
bigtits
bigboobs
blowjob
topless
masturbate
masturbation
shemale
shemales
beautifulgirls
stripchat