// caveman/cave/src/feed.rs
//
// 296 lines
// 8.4 KiB
// Rust

use std::{collections::{HashMap, HashSet}, time::Duration, ops::Deref};
use chrono::{DateTime, FixedOffset};
use futures::{Stream, StreamExt};
use eventsource_stream::Eventsource;
use reqwest::StatusCode;
/// Extracts the lower-cased domain name from `url`.
///
/// Returns `None` when `url` cannot be parsed (a warning is logged in that
/// case) or when the parsed URL has no domain.
#[must_use]
pub fn url_host(url: &str) -> Option<String> {
    match reqwest::Url::parse(url) {
        Ok(parsed) => parsed.domain().map(str::to_lowercase),
        Err(e) => {
            tracing::warn!("Cannot parse url {:?}: {}", url, e);
            None
        }
    }
}
/// Author information embedded in a [`Post`] (its `account` field).
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct Account {
/// Account name; `Post::user_id` lower-cases it to build `username@host`.
pub username: String,
/// Display name as delivered in the serialized data.
pub display_name: String,
/// Account URL; `Account::host` extracts its domain.
pub url: String,
/// Bot flag from the serialized data (presumably marks automated
/// accounts; confirm against the data source).
pub bot: bool,
// pub avatar: String,
// pub header: String,
}
impl Account {
    /// Lower-cased domain of this account's `url`, or `None` if the URL
    /// cannot be parsed or has no domain.
    #[must_use]
    pub fn host(&self) -> Option<String> {
        let account_url: &str = &self.url;
        url_host(account_url)
    }
}
/// A tag attached to a [`Post`] (its `tags` field).
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct Tag {
/// Tag name in its original spelling; `Post::tags_set` groups spellings
/// by their lower-cased form.
pub name: String,
}
/// Application description.
///
/// Not referenced by [`Post`] in this file: the `application` field there
/// is commented out.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct Application {
/// Application name.
pub name: String,
/// Optional website of the application.
pub website: Option<String>,
}
/// An account mentioned in a [`Post`] (its `mentions` field).
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct Mention {
/// Name of the mentioned account.
pub username: String,
/// URL of the mentioned account; `Mention::host` extracts its domain.
pub url: String,
/// Optional account handle (presumably `user` or `user@host`; confirm
/// against the data source).
pub acct: Option<String>,
}
impl Mention {
    /// Lower-cased domain of the mentioned account's `url`, or `None` if
    /// the URL cannot be parsed or has no domain.
    #[must_use]
    pub fn host(&self) -> Option<String> {
        let mention_url: &str = &self.url;
        url_host(mention_url)
    }
}
/// A media item attached to a [`Post`] (its `media_attachments` field).
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct MediaAttachment {
/// Kind of media; (de)serialized under the key `type`, which is a
/// reserved word in Rust and therefore renamed here.
#[serde(rename = "type")]
pub media_type: String,
/// Optional remote URL of the media.
pub remote_url: Option<String>,
}
/// A single post as (de)serialized from/to JSON.
///
/// Fields marked `#[serde(default)]` may be absent from the input and then
/// take their default value.
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct Post {
/// Creation time as a string; `Post::timestamp` parses it as RFC 3339.
pub created_at: String,
/// URL of the post; `Post::url_host` extracts its domain.
pub url: String,
/// Post content; an empty string when absent from the input.
#[serde(default = "String::new")]
pub content: String,
/// Author of the post.
pub account: Account,
/// Tags of the post; empty when absent from the input.
#[serde(default)]
pub tags: Vec<Tag>,
// pub application: Option<Application>,
/// Optional "sensitive" flag as delivered in the input.
pub sensitive: Option<bool>,
/// Accounts mentioned in the post; empty when absent from the input.
#[serde(default)]
pub mentions: Vec<Mention>,
/// Optional language tag; `Post::lang` clips it to two characters.
pub language: Option<String>,
/// Attached media; empty when absent from the input.
#[serde(default)]
pub media_attachments: Vec<MediaAttachment>,
/// Optional nested post (boxed because the type is recursive); `None`
/// when absent from the input.
#[serde(default)]
pub reblog: Option<Box<Post>>,
}
impl Post {
    /// Domain of this post's `url`.
    ///
    /// Returns `None` if the URL cannot be parsed or has no domain.
    #[must_use]
    pub fn url_host(&self) -> Option<String> {
        reqwest::Url::parse(&self.url)
            .ok()
            .and_then(|url| url.domain().map(std::borrow::ToOwned::to_owned))
    }

    /// Identifier of the author in the form `username@host`, built from the
    /// lower-cased account username and the domain of the post's `url`.
    ///
    /// Returns `None` if the post URL yields no domain.
    #[must_use]
    pub fn user_id(&self) -> Option<String> {
        let username = self.account.username.to_lowercase();
        let host = self.url_host()?;
        Some(format!("{username}@{host}"))
    }

    /// Parses `created_at` as an RFC 3339 timestamp.
    ///
    /// Returns `None` if the string is not valid RFC 3339.
    #[must_use]
    pub fn timestamp(&self) -> Option<DateTime<FixedOffset>> {
        DateTime::parse_from_rfc3339(&self.created_at).ok()
    }

    /// clip "en-us" to "en"
    ///
    /// Returns the lower-cased first two bytes of `language`. Yields `None`
    /// when no language is set, when it is shorter than two bytes, or when
    /// a two-byte prefix would split a multi-byte character.
    #[must_use]
    pub fn lang(&self) -> Option<String> {
        let language = self.language.as_deref()?;
        // `str::get` checks both the length and the char boundary, so
        // untrusted input such as "日本語" yields `None` instead of the
        // panic that byte-indexing with `[..2]` would cause.
        language.get(..2).map(str::to_lowercase)
    }

    /// Groups the post's tags by their lower-cased name.
    ///
    /// Each key is a lower-cased tag name; the value is the set of original
    /// spellings seen for it. Tags whose name contains whitespace are
    /// skipped.
    pub fn tags_set(&self) -> HashMap<String, HashSet<String>> {
        let mut result: HashMap<String, HashSet<String>> =
            HashMap::with_capacity(self.tags.len());
        for tag in &self.tags {
            let name = tag.name.to_lowercase();
            if name.contains(char::is_whitespace) {
                continue;
            }
            result.entry(name).or_default().insert(tag.name.clone());
        }
        result
    }
}
/// Serializable form kept alongside a parsed [`Post`] inside
/// [`EncodablePost`].
#[derive(Debug)]
enum EncodedPost {
/// A parsed JSON value; serialized to bytes by `EncodablePost::encode`.
Value(serde_json::Value),
/// Already serialized JSON bytes; returned as-is by
/// `EncodablePost::encode`.
Bytes(Vec<u8>),
/// Placeholder left behind after `EncodablePost::encode` has moved the
/// data out; encoding again panics.
Stolen,
}
// TODO: eliminate
/// Wraps a `Post` along with a serializable form that is closest
/// to the original incoming data
#[derive(Debug)]
pub struct EncodablePost {
/// Event type supplied to the constructor (`Feed::fetch` passes
/// `"update"`, `Feed::stream` passes the name of the received event).
pub event_type: String,
// Parsed post; exposed read-only through the `Deref` impl.
post: Post,
// Serializable form handed out (once) by `encode`.
encoded: EncodedPost,
}
impl Deref for EncodablePost {
type Target = Post;
fn deref(&self) -> &Self::Target {
&self.post
}
}
impl EncodablePost {
pub fn from_post(event_type: String, post: Post) -> Result<Self, serde_json::Error> {
let bytes = serde_json::to_vec(&post)?;
Ok(EncodablePost {
event_type,
post,
encoded: EncodedPost::Bytes(bytes),
})
}
pub fn from_value(event_type: String, value: serde_json::Value) -> Result<Self, serde_json::Error> {
let post = serde_json::from_value(value.clone())?;
Ok(EncodablePost {
event_type,
post,
encoded: EncodedPost::Value(value),
})
}
pub fn from_bytes(event_type: String, bytes: Vec<u8>) -> Result<Self, serde_json::Error> {
let post = serde_json::from_slice(&bytes)?;
Ok(EncodablePost {
event_type,
post,
encoded: EncodedPost::Bytes(bytes),
})
}
pub fn encode(&mut self) -> Result<Vec<u8>, serde_json::Error> {
use std::mem::replace;
let encoded = replace(&mut self.encoded, EncodedPost::Stolen);
match encoded {
EncodedPost::Value(value) =>
serde_json::to_vec(&value),
EncodedPost::Bytes(bytes) =>
Ok(bytes),
EncodedPost::Stolen =>
panic!("EncodedPost::Stolen"),
}
}
}
/// A batch of posts, as produced by `Feed::fetch`.
#[derive(Debug)]
pub struct Feed {
/// The posts of this batch, in the order they were parsed.
pub posts: Vec<EncodablePost>,
}
impl Feed {
/// Analyze time intervals between posts to estimate when to fetch
/// next
#[must_use] pub fn mean_post_interval(&self) -> Option<Duration> {
let mut timestamps = self.posts.iter()
.filter_map(|post| post.timestamp())
.collect::<Vec<_>>();
timestamps.sort();
if timestamps.len() > 2 {
Some(
((*timestamps.last().unwrap() - timestamps[0]) / (timestamps.len() as i32 - 1)
).to_std().unwrap()
)
} else {
None
}
}
pub async fn fetch(client: &reqwest::Client, url: &str) -> Result<Self, reqwest::Error> {
let body = client.get(url)
.send()
.await?
.bytes()
.await?;
let posts = tokio::task::spawn_blocking(move || {
let values: Vec<serde_json::Value> = serde_json::from_slice(&body)?;
let posts: Vec<EncodablePost> = values.into_iter()
.filter_map(|value| EncodablePost::from_value("update".to_string(), value).ok())
.collect();
Ok::<_, serde_json::Error>(posts)
}).await.expect("join blocking")
.unwrap_or_else(|e| {
tracing::error!("{}", e);
vec![]
});
tracing::trace!("{} {} posts", url, posts.len());
Ok(Feed { posts })
}
pub async fn stream(client: &reqwest::Client, url: &str) -> Result<impl Stream<Item = EncodablePost>, StreamError> {
let res = client.get(url)
.timeout(Duration::MAX)
.send()
.await
.map_err(StreamError::Http)?;
if res.status() != 200 {
return Err(StreamError::HttpStatus(res.status()));
}
let ct = res.headers().get("content-type")
.and_then(|c| c.to_str().ok());
if ct.map_or(true, |ct| ct != "text/event-stream") {
return Err(StreamError::InvalidContentType(ct.unwrap_or("").to_owned()));
}
let src = res.bytes_stream().eventsource()
.filter_map(|result| async move {
let event = result.ok()?;
EncodablePost::from_bytes(event.event, event.data.into_bytes()).ok()
});
Ok(src)
}
}
/// Reasons why `Feed::stream` could not open an event stream.
#[derive(Debug)]
pub enum StreamError {
    /// The server answered with a status other than 200.
    HttpStatus(StatusCode),
    /// The request itself failed.
    Http(reqwest::Error),
    /// The response's Content-Type (carried verbatim, empty if missing or
    /// unreadable) is not an event stream.
    InvalidContentType(String),
}

impl std::fmt::Display for StreamError {
    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
        match self {
            StreamError::HttpStatus(code) => write!(fmt, "HTTP/{code}"),
            // Named explicitly because `Debug` is now derived as well.
            StreamError::Http(e) => std::fmt::Display::fmt(e, fmt),
            StreamError::InvalidContentType(ct) => write!(fmt, "Invalid Content-Type: {ct}"),
        }
    }
}

impl std::error::Error for StreamError {
    /// `Display` already prints the wrapped HTTP error, so the chain
    /// continues with that error's own source to avoid reporting it twice.
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self {
            StreamError::Http(e) => std::error::Error::source(e),
            StreamError::HttpStatus(_) | StreamError::InvalidContentType(_) => None,
        }
    }
}