2022-11-11 16:45:43 +01:00
|
|
|
use std::collections::{BTreeSet, HashMap};
|
2022-11-05 20:04:31 +01:00
|
|
|
use redis::{
|
|
|
|
aio::ConnectionManager,
|
|
|
|
RedisError,
|
|
|
|
};
|
|
|
|
|
|
|
|
/// Minimum number of mentions per unit of `period` that a tag must have collected in the
/// most recent period; below `MIN_AFTER_MENTIONS * period` mentions, `TrendTag::score`
/// returns `0`.
const MIN_AFTER_MENTIONS: usize = 3;
|
|
|
|
|
|
|
|
/// A tag together with the counters loaded from its Redis hash.
///
/// Instances are assembled by `TrendTag::from_hash` from the flat field/value list of an
/// `HGETALL` reply.
#[derive(Debug)]
pub struct TrendTag {
    /// Tag name; `TrendTag::for_each` sets it to the Redis key with the scan prefix removed.
    pub name: String,
    /// `(hour, mentions)` pairs parsed from the hash fields named `t:<hour>`.
    pub by_hour: Vec<(u64, usize)>,
    /// All remaining hash fields as raw `(field, value)` pairs. The accessors of this type
    /// read `s:<spelling>` and `h:<host>` fields from here, each with a count as its value.
    other: Vec<(String, String)>,
}
|
|
|
|
|
2022-11-08 00:43:46 +01:00
|
|
|
impl TrendTag {
|
2022-11-05 20:04:31 +01:00
|
|
|
pub async fn for_each<F: FnMut(Self)>(
|
|
|
|
redis_man: &mut ConnectionManager,
|
|
|
|
language: Option<String>,
|
|
|
|
mut f: F,
|
|
|
|
) -> Result<(), RedisError> {
|
|
|
|
let prefix = match language {
|
|
|
|
Some(language) => format!("l:{}:", language),
|
|
|
|
None => "g:".to_string(),
|
|
|
|
};
|
|
|
|
|
|
|
|
let mut cursor = None;
|
|
|
|
while cursor != Some(0) {
|
|
|
|
let mut cmd = redis::cmd("SCAN");
|
|
|
|
cmd.cursor_arg(cursor.unwrap_or(0))
|
|
|
|
.arg("MATCH").arg(format!("{}*", prefix))
|
|
|
|
.arg("COUNT").arg(1000);
|
|
|
|
let (next_cursor, keys) =
|
|
|
|
cmd.query_async::<_, (u64, Vec<String>)>(redis_man)
|
|
|
|
.await?;
|
|
|
|
|
|
|
|
let mut cmd = redis::pipe();
|
|
|
|
for key in &keys {
|
|
|
|
cmd.hgetall(key);
|
|
|
|
}
|
|
|
|
let others =
|
|
|
|
cmd.query_async::<_, Vec<Vec<String>>>(redis_man)
|
|
|
|
.await?;
|
|
|
|
for (key, other) in keys.iter().zip(others) {
|
|
|
|
let name = key[prefix.len()..].to_string();
|
|
|
|
let tag = Self::from_hash(name, other);
|
|
|
|
f(tag);
|
|
|
|
}
|
|
|
|
|
|
|
|
cursor = Some(next_cursor);
|
|
|
|
}
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2022-11-09 18:11:02 +01:00
|
|
|
pub(crate) fn from_hash(name: String, hash_values: Vec<String>) -> Self {
|
2022-11-05 20:04:31 +01:00
|
|
|
let mut by_hour = Vec::with_capacity(hash_values.len() / 2);
|
|
|
|
let mut other = Vec::with_capacity(hash_values.len() / 2);
|
|
|
|
|
|
|
|
let mut key: Option<String> = None;
|
|
|
|
for value in hash_values.into_iter() {
|
|
|
|
if let Some(key) = key.take() {
|
|
|
|
if &key[..2] == "t:" {
|
|
|
|
if let (Ok(hour), Ok(value)) = (str::parse(&key[2..]), str::parse(&value)) {
|
|
|
|
by_hour.push((hour, value));
|
|
|
|
}
|
|
|
|
} else if let Ok(value) = str::parse(&value) {
|
|
|
|
other.push((key, value));
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
key = Some(value);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(debug)]
|
|
|
|
by_hour.sort();
|
|
|
|
|
2022-11-08 00:43:46 +01:00
|
|
|
TrendTag {
|
2022-11-09 18:11:02 +01:00
|
|
|
name,
|
2022-11-05 20:04:31 +01:00
|
|
|
by_hour,
|
|
|
|
other,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn score(&self, period: u64, until: u64) -> f64 {
|
2022-11-09 19:05:40 +01:00
|
|
|
// ignore spam that comes from only 1 instance
|
|
|
|
if self.hosts().skip(1).next().is_none() {
|
2022-11-10 15:23:02 +01:00
|
|
|
return -1.;
|
2022-11-09 19:05:40 +01:00
|
|
|
}
|
|
|
|
|
2022-11-05 20:04:31 +01:00
|
|
|
let from = until - period;
|
2022-11-10 03:28:20 +01:00
|
|
|
let not_before = from - 3 * period;
|
2022-11-05 20:04:31 +01:00
|
|
|
let mut before_mentions = 0;
|
|
|
|
let mut before_hours = 0;
|
|
|
|
let mut after_mentions = 0;
|
|
|
|
|
|
|
|
for (hour, mentions) in self.by_hour.iter().cloned() {
|
2022-11-10 03:28:20 +01:00
|
|
|
if hour > from {
|
|
|
|
after_mentions += mentions;
|
|
|
|
} else if hour > not_before {
|
2022-11-05 20:04:31 +01:00
|
|
|
before_mentions += mentions;
|
|
|
|
before_hours += 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if after_mentions < MIN_AFTER_MENTIONS * (period as usize) {
|
|
|
|
return 0.;
|
|
|
|
}
|
|
|
|
|
|
|
|
let before = if before_hours > 0 && before_mentions > 0 {
|
|
|
|
(before_mentions as f64) / (before_hours as f64)
|
|
|
|
} else { 0.1 };
|
|
|
|
let after = (after_mentions as f64) / (period as f64);
|
|
|
|
after / before
|
|
|
|
}
|
2022-11-06 02:20:36 +01:00
|
|
|
|
2022-11-11 16:45:43 +01:00
|
|
|
pub fn hour_scores(&self, period: u64, until: u64) -> Vec<usize> {
|
|
|
|
let hours = self.by_hour.iter().cloned()
|
|
|
|
.collect::<HashMap<_, _>>();
|
|
|
|
|
|
|
|
let from = until - period;
|
|
|
|
let not_before = from - 3 * period;
|
|
|
|
(not_before + 1 ..= until).map(|hour|
|
|
|
|
*hours.get(&hour).unwrap_or(&0)
|
|
|
|
).collect()
|
|
|
|
}
|
|
|
|
|
2022-11-11 19:55:01 +01:00
|
|
|
pub fn hour_scores_data(&self, period: u64, until: u64) -> String {
|
|
|
|
self.hour_scores(period, until)
|
2022-11-11 16:45:43 +01:00
|
|
|
.iter()
|
|
|
|
.map(|count| format!("{} ", count))
|
|
|
|
.collect()
|
|
|
|
}
|
|
|
|
|
2022-11-06 02:20:36 +01:00
|
|
|
fn spellings(&self) -> impl Iterator<Item = (usize, &str)> {
|
|
|
|
self.other.iter()
|
|
|
|
.filter_map(|(key, value)| {
|
|
|
|
if &key[..2] != "s:" {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Ok(count) = str::parse(value) {
|
|
|
|
return Some((count, &key[2..]));
|
|
|
|
}
|
|
|
|
|
|
|
|
None
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
pub fn spelling(&self) -> &str {
|
|
|
|
self.spellings()
|
|
|
|
.map(|(count, spelling)| {
|
|
|
|
if spelling.chars().any(|c| c.is_uppercase()) {
|
|
|
|
// favor captialized spelling
|
|
|
|
(10 * count, spelling)
|
|
|
|
} else {
|
|
|
|
(count, spelling)
|
|
|
|
}
|
|
|
|
})
|
|
|
|
.max()
|
|
|
|
.map(|(_count, spelling)| spelling)
|
|
|
|
.unwrap_or(&self.name)
|
|
|
|
}
|
|
|
|
|
2022-11-06 23:49:42 +01:00
|
|
|
pub fn hosts(&self) -> impl Iterator<Item = (usize, &str)> {
|
2022-11-06 02:20:36 +01:00
|
|
|
self.other.iter()
|
|
|
|
.filter_map(|(key, value)| {
|
|
|
|
if &key[..2] != "h:" {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
if let Ok(count) = str::parse(value) {
|
|
|
|
return Some((count, &key[2..]));
|
|
|
|
}
|
|
|
|
|
|
|
|
None
|
|
|
|
})
|
2022-11-06 23:49:42 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
/// ordered by count
|
|
|
|
pub fn hosts_set(&self) -> BTreeSet<(usize, &str)> {
|
|
|
|
self.hosts().collect()
|
2022-11-06 02:20:36 +01:00
|
|
|
}
|
2022-11-05 20:04:31 +01:00
|
|
|
}
|