caveman/gatherer/src/trends.rs

123 lines
3.8 KiB
Rust

use std::{cmp::Ordering, collections::HashSet};
use std::collections::BTreeMap;
use std::sync::Arc;
use std::time::Instant;
use redis::{
RedisError,
};
use cave::current_hour;
use cave::store::Store;
use cave::trend_tag::TrendTag;
/// Output of `TrendAnalyzer::run`: one `(until, period, tags)` entry per analyzed period,
/// where `tags` lists `(score, tag)` pairs from the highest score to the lowest.
pub type TrendsResults = Vec<(u64, u64, Vec<(f64, Arc<TrendTag>)>)>;
/// Ranking key: a score plus the tag name as a tie-breaker.
///
/// Keys sort by ascending `score`; keys with equal scores sort by `tag`. A NaN score sorts
/// after every other score and compares equal to any other NaN score.
///
/// `PartialEq`, `Eq`, `PartialOrd` and `Ord` are all derived from [`Ord::cmp`] so that they
/// can never disagree, which ordered collections such as `BTreeMap` require of their keys.
#[derive(Debug, Clone)]
pub struct ScoreKey {
    score: f64,
    tag: Arc<String>,
}

impl PartialEq for ScoreKey {
    fn eq(&self, other: &Self) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

impl Eq for ScoreKey {}

impl PartialOrd for ScoreKey {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}

impl Ord for ScoreKey {
    fn cmp(&self, other: &Self) -> Ordering {
        self.score
            .partial_cmp(&other.score)
            // `partial_cmp` only yields `None` when a NaN is involved; rank NaN above all
            // numbers (and equal to other NaNs) so the order stays total and antisymmetric.
            .unwrap_or_else(|| self.score.is_nan().cmp(&other.score.is_nan()))
            .then_with(|| self.tag.cmp(&other.tag))
    }
}
/// Accumulates the best-scoring trend tags for a single period.
///
/// `process_tag` inserts candidates into `result` and drops the lowest-scoring entry
/// whenever more than `size` entries are held.
#[derive(Debug)]
pub struct TrendAnalyzer {
/// Period to analyze, in hours
period: u64,
/// *now* in hours; passed to the tag scoring together with `period`
until: u64,
/// Maximum number of entries to keep in `result`
size: usize,
/// Ranked tags, ordered by ascending score. The key contains the tag name so that
/// tags with the same score do not collide.
pub result: BTreeMap<ScoreKey, Arc<TrendTag>>,
/// Score of the lowest-ranked entry in `result`; `None` until the first insertion
score_threshold: Option<f64>,
}
impl TrendAnalyzer {
    /// Computes the top trending tags for each of `periods`.
    ///
    /// Loads the trend pools for `periods` from `store`, merges their tag names into one
    /// de-duplicated set, fetches the matching [`TrendTag`]s and scores every tag once per
    /// period. `language` is passed through unchanged to both store queries.
    ///
    /// Returns one `(until, period, tags)` entry per element of `periods`, in the same
    /// order; `tags` holds at most `size` `(score, tag)` pairs, highest score first.
    ///
    /// The duration of each step is reported to the `trends_page_time` histogram and the
    /// number of distinct tags to `trends_page_tags`.
    ///
    /// # Errors
    ///
    /// Returns the [`RedisError`] of the first store query that fails.
    pub async fn run(
        store: &mut Store,
        size: usize,
        periods: &[u64],
        language: Option<String>,
    ) -> Result<TrendsResults, RedisError> {
        // Metrics label only: distinguishes language-filtered from unfiltered requests.
        let lang = if language.is_some() { "some" } else { "any" };
        let until = current_hour();
        let mut analyzers: Vec<TrendAnalyzer> = periods
            .iter()
            .map(|&period| TrendAnalyzer {
                period,
                until,
                size,
                result: BTreeMap::new(),
                score_threshold: None,
            })
            .collect();

        let t1 = Instant::now();
        let pools = store.get_trend_pools(&language, periods).await?;
        // The same tag may show up in several pools; fetch each one only once.
        let tags: HashSet<String> = pools
            .into_iter()
            .flat_map(|(_period, tags)| tags.into_iter())
            .collect();
        let tags_len = tags.len();
        let t2 = Instant::now();
        let trend_tags = store.get_trend_tags(&language, tags.into_iter()).await?;
        let t3 = Instant::now();
        metrics::histogram!("trends_page_time", t2 - t1, "step" => "get_trend_pools", "lang" => lang);
        metrics::histogram!("trends_page_time", t3 - t2, "step" => "get_trend_tags", "lang" => lang);
        metrics::histogram!("trends_page_tags", tags_len as f64, "lang" => lang);

        for trend_tag in trend_tags {
            // Shared by all analyzers, so wrap once and hand out cheap `Arc` clones.
            let trend_tag = Arc::new(trend_tag);
            let name = Arc::new(trend_tag.name.clone());
            for analyzer in &mut analyzers {
                analyzer.process_tag(&name, &trend_tag);
            }
        }

        let results: TrendsResults = analyzers
            .into_iter()
            .map(|analyzer| {
                // The map is sorted by ascending score; reverse it so the best tag is first.
                // The analyzer is owned here, so its entries are moved out rather than cloned.
                let ranked = analyzer
                    .result
                    .into_iter()
                    .rev()
                    .map(|(key, tag)| (key.score, tag))
                    .collect();
                (analyzer.until, analyzer.period, ranked)
            })
            .collect();
        let t4 = Instant::now();
        metrics::histogram!("trends_page_time", t4 - t3, "step" => "analyze", "lang" => lang);

        Ok(results)
    }

    /// Scores `tag` for this analyzer's period and keeps it if it ranks among the best
    /// `size` tags seen so far.
    ///
    /// `name` becomes part of the ranking key, so that distinct tags with identical scores
    /// do not overwrite each other. Tags whose score is NaN, zero or negative are ignored.
    /// When the ranking grows beyond `size`, its lowest-scoring entry is dropped.
    pub fn process_tag(&mut self, name: &Arc<String>, tag: &Arc<TrendTag>) {
        if self.size == 0 {
            // Nothing can ever be kept; inserting and evicting would leave an empty map.
            return;
        }

        let score = tag.score(self.period, self.until);
        // NaN fails every comparison, so it must be rejected explicitly.
        if score.is_nan() || score <= 0. {
            return;
        }

        let is_full = self.result.len() >= self.size;
        if is_full && self.score_threshold.map_or(false, |threshold| score < threshold) {
            // score is below everything currently held in self.result
            return;
        }

        self.result.insert(
            ScoreKey {
                score,
                tag: Arc::clone(name),
            },
            Arc::clone(tag),
        );
        if self.result.len() > self.size {
            // The first key of the map is the lowest-ranked entry.
            if let Some(least) = self.result.keys().next().cloned() {
                self.result.remove(&least);
            }
        }
        self.score_threshold = self.result.keys().next().map(|least| least.score);
    }
}