caveman/cave/src/trend_tag.rs

185 lines
5.3 KiB
Rust
Raw Normal View History

2022-11-11 16:45:43 +01:00
use std::collections::{BTreeSet, HashMap};
2022-11-05 20:04:31 +01:00
use redis::{
aio::ConnectionManager,
RedisError,
};
/// Minimum average number of mentions per hour of the scored period.
///
/// `TrendTag::score` returns `0` for a tag whose mentions in the recent
/// window total less than this value multiplied by the period length.
const MIN_AFTER_MENTIONS: usize = 3;
/// A tag together with the counters that were stored for it in a Redis hash.
///
/// Instances are built by `TrendTag::from_hash` from the flat field/value
/// list of such a hash.
#[derive(Debug)]
pub struct TrendTag {
    /// Tag name; when loaded through `for_each` this is the Redis key with
    /// its `g:` / `l:<language>:` prefix removed.
    pub name: String,
    /// `(hour, mentions)` pairs taken from the `t:<hour>` hash fields.
    pub by_hour: Vec<(u64, usize)>,
    /// All remaining hash fields with their raw string values, e.g. the
    /// `s:<spelling>` and `h:<host>` counters read by `spellings()` and
    /// `hosts()`.
    other: Vec<(String, String)>,
}
2022-11-08 00:43:46 +01:00
impl TrendTag {
2022-11-05 20:04:31 +01:00
/// Visits every trend tag stored under the global or a per-language prefix.
///
/// Keys are enumerated with `SCAN` (`MATCH <prefix>*`, `COUNT 1000`). For
/// each batch of keys the hashes are fetched through one pipeline of
/// `HGETALL` commands, converted with `from_hash` and passed to `f`.
///
/// * `redis_man` – connection used for all queries.
/// * `language` – `Some(lang)` selects keys prefixed `l:<lang>:`, `None`
///   selects keys prefixed `g:`.
/// * `f` – callback invoked once for every key returned by the scan.
///
/// # Errors
/// Returns the first `RedisError` produced by a `SCAN` or pipeline query.
pub async fn for_each<F: FnMut(Self)>(
    redis_man: &mut ConnectionManager,
    language: Option<String>,
    mut f: F,
) -> Result<(), RedisError> {
    let prefix = language.map_or_else(
        || "g:".to_string(),
        |language| format!("l:{}:", language),
    );
    let pattern = format!("{}*", prefix);

    // SCAN starts at cursor 0 and is complete once the server returns 0 again.
    let mut cursor: u64 = 0;
    loop {
        let mut scan = redis::cmd("SCAN");
        scan.cursor_arg(cursor)
            .arg("MATCH").arg(pattern.as_str())
            .arg("COUNT").arg(1000);
        let (next_cursor, keys): (u64, Vec<String>) =
            scan.query_async(redis_man).await?;

        // Fetch all hashes of this batch in a single round trip.
        let mut pipeline = redis::pipe();
        for key in &keys {
            pipeline.hgetall(key);
        }
        let hashes: Vec<Vec<String>> =
            pipeline.query_async(redis_man).await?;

        for (key, hash_values) in keys.iter().zip(hashes) {
            // The tag name is whatever follows the scanned prefix.
            let name = key[prefix.len()..].to_string();
            f(Self::from_hash(name, hash_values));
        }

        if next_cursor == 0 {
            break;
        }
        cursor = next_cursor;
    }

    Ok(())
}
pub(crate) fn from_hash(name: String, hash_values: Vec<String>) -> Self {
2022-11-05 20:04:31 +01:00
let mut by_hour = Vec::with_capacity(hash_values.len() / 2);
let mut other = Vec::with_capacity(hash_values.len() / 2);
let mut key: Option<String> = None;
for value in hash_values.into_iter() {
if let Some(key) = key.take() {
if &key[..2] == "t:" {
if let (Ok(hour), Ok(value)) = (str::parse(&key[2..]), str::parse(&value)) {
by_hour.push((hour, value));
}
} else if let Ok(value) = str::parse(&value) {
other.push((key, value));
}
} else {
key = Some(value);
}
}
#[cfg(debug)]
by_hour.sort();
2022-11-08 00:43:46 +01:00
TrendTag {
name,
2022-11-05 20:04:31 +01:00
by_hour,
other,
}
}
pub fn score(&self, period: u64, until: u64) -> f64 {
2022-11-09 19:05:40 +01:00
// ignore spam that comes from only 1 instance
if self.hosts().skip(1).next().is_none() {
2022-11-10 15:23:02 +01:00
return -1.;
2022-11-09 19:05:40 +01:00
}
2022-11-05 20:04:31 +01:00
let from = until - period;
let not_before = from - 3 * period;
2022-11-05 20:04:31 +01:00
let mut before_mentions = 0;
let mut before_hours = 0;
let mut after_mentions = 0;
for (hour, mentions) in self.by_hour.iter().cloned() {
if hour > from {
after_mentions += mentions;
} else if hour > not_before {
2022-11-05 20:04:31 +01:00
before_mentions += mentions;
before_hours += 1;
}
}
if after_mentions < MIN_AFTER_MENTIONS * (period as usize) {
return 0.;
}
let before = if before_hours > 0 && before_mentions > 0 {
(before_mentions as f64) / (before_hours as f64)
} else { 0.1 };
let after = (after_mentions as f64) / (period as f64);
after / before
}
2022-11-06 02:20:36 +01:00
2022-11-11 16:45:43 +01:00
/// Returns the mention count of every hour in the `4 * period` hours ending
/// at `until` (inclusive), oldest first.
///
/// Hours without an entry in `by_hour` yield `0`. When `until` is smaller
/// than `4 * period` the window is clipped so that it starts at hour `1`
/// and the result is correspondingly shorter.
pub fn hour_scores(&self, period: u64, until: u64) -> Vec<usize> {
    let hours: HashMap<u64, usize> = self.by_hour.iter().copied().collect();

    // Saturate instead of underflowing (panic in debug, wrap-around in
    // release) for small `until` values.
    let from = until.saturating_sub(period);
    let not_before = from.saturating_sub(period.saturating_mul(3));

    // Iterate `not_before..until` and shift by one to cover
    // `not_before + 1 ..= until` without a `+ 1` on the range bound.
    (not_before..until)
        .map(|previous| {
            let hour = previous + 1;
            hours.get(&hour).copied().unwrap_or(0)
        })
        .collect()
}
pub fn hour_scores_data(&self, period: u64, until: u64) -> String {
self.hour_scores(period, until)
2022-11-11 16:45:43 +01:00
.iter()
.map(|count| format!("{} ", count))
.collect()
}
2022-11-06 02:20:36 +01:00
/// Iterates over the recorded spellings of this tag as `(count, spelling)`.
///
/// Spellings are the entries of `other` whose key starts with `s:`; the
/// prefix is removed from the yielded spelling. Entries whose value does
/// not parse as a count are skipped.
fn spellings(&self) -> impl Iterator<Item = (usize, &str)> {
    self.other.iter().filter_map(|(key, value)| {
        // `strip_prefix` rather than `key[..2]`: slicing panics for keys
        // shorter than two bytes or with a multi-byte character straddling
        // byte index 2.
        let spelling = key.strip_prefix("s:")?;
        let count = value.parse::<usize>().ok()?;
        Some((count, spelling))
    })
}
/// Picks the spelling to display for this tag.
///
/// Every recorded spelling is ranked by its count, multiplied by 10 when
/// the spelling contains an uppercase character; ties are decided by the
/// ordering of the spelling strings. Falls back to `name` when no spelling
/// is recorded.
pub fn spelling(&self) -> &str {
    let best = self
        .spellings()
        .map(|(count, spelling)| {
            // favor capitalized spelling
            let factor = if spelling.chars().any(char::is_uppercase) {
                10
            } else {
                1
            };
            (factor * count, spelling)
        })
        .max();

    match best {
        Some((_weight, spelling)) => spelling,
        None => self.name.as_str(),
    }
}
2022-11-06 23:49:42 +01:00
pub fn hosts(&self) -> impl Iterator<Item = (usize, &str)> {
2022-11-06 02:20:36 +01:00
self.other.iter()
.filter_map(|(key, value)| {
if &key[..2] != "h:" {
return None;
}
if let Ok(count) = str::parse(value) {
return Some((count, &key[2..]));
}
None
})
2022-11-06 23:49:42 +01:00
}
/// ordered by count
pub fn hosts_set(&self) -> BTreeSet<(usize, &str)> {
self.hosts().collect()
2022-11-06 02:20:36 +01:00
}
2022-11-05 20:04:31 +01:00
}