caveman/cave/src/feed.rs

268 lines
7.5 KiB
Rust
Raw Normal View History

2022-12-02 22:02:37 +01:00
use std::{collections::{HashMap, HashSet}, time::Duration, ops::Deref};
2022-11-02 22:42:43 +01:00
use chrono::{DateTime, FixedOffset};
2022-11-11 21:52:52 +01:00
use futures::{Stream, StreamExt};
use eventsource_stream::Eventsource;
2022-11-02 22:42:43 +01:00
2022-11-15 00:45:02 +01:00
#[derive(Debug, serde::Serialize, serde::Deserialize)]
2022-11-02 21:12:16 +01:00
pub struct Account {
pub username: String,
pub display_name: String,
pub url: String,
pub bot: bool,
pub avatar: String,
pub header: String,
}
impl Account {
pub fn host(&self) -> Option<String> {
reqwest::Url::parse(&self.url)
.ok()
.and_then(|url| url.domain()
2022-11-03 17:37:06 +01:00
.map(|s| s.to_lowercase())
2022-11-02 21:12:16 +01:00
)
}
}
2022-11-15 00:45:02 +01:00
#[derive(Debug, serde::Serialize, serde::Deserialize)]
2022-11-02 21:12:16 +01:00
pub struct Tag {
pub name: String,
}
2022-11-15 00:45:02 +01:00
#[derive(Debug, serde::Serialize, serde::Deserialize)]
2022-11-02 21:12:16 +01:00
pub struct Application {
pub name: String,
pub website: Option<String>,
2022-11-02 21:12:16 +01:00
}
2022-11-15 00:45:02 +01:00
#[derive(Debug, serde::Serialize, serde::Deserialize)]
2022-11-03 03:42:13 +01:00
pub struct Mention {
pub username: Option<String>,
pub url: String,
pub acct: Option<String>,
}
impl Mention {
pub fn user_host(&self) -> Option<String> {
reqwest::Url::parse(&self.url)
.ok()
.and_then(|url| url.domain()
2022-11-03 17:37:06 +01:00
.map(|host| host.to_lowercase())
2022-11-03 03:42:13 +01:00
)
}
}
2022-11-25 02:43:28 +01:00
/// A media attachment entry inside a [`Post`].
#[derive(Debug, serde::Serialize, serde::Deserialize)]
pub struct MediaAttachment {
    /// Value of the JSON key `type`; renamed because `type` is a Rust
    /// keyword.
    #[serde(rename = "type")]
    pub media_type: String,
    /// Optional remote URL string as delivered.
    pub remote_url: Option<String>,
}
2022-11-15 00:45:02 +01:00
#[derive(Debug, serde::Serialize, serde::Deserialize)]
2022-11-02 21:12:16 +01:00
pub struct Post {
pub created_at: String,
2022-11-04 15:50:00 +01:00
pub uri: String,
2022-11-25 02:43:28 +01:00
#[serde(default = "String::new")]
2022-11-02 21:12:16 +01:00
pub content: String,
pub account: Account,
2022-11-25 02:43:28 +01:00
#[serde(default)]
2022-11-02 21:12:16 +01:00
pub tags: Vec<Tag>,
pub application: Option<Application>,
pub sensitive: Option<bool>,
2022-11-25 02:43:28 +01:00
#[serde(default)]
2022-11-03 03:42:13 +01:00
pub mentions: Vec<Mention>,
2022-11-03 17:13:03 +01:00
pub language: Option<String>,
2022-11-25 02:43:28 +01:00
#[serde(default)]
pub media_attachments: Vec<MediaAttachment>,
#[serde(default)]
pub reblog: Option<Box<Post>>,
2022-11-02 21:12:16 +01:00
}
impl Post {
2022-11-04 15:50:00 +01:00
pub fn uri_host(&self) -> Option<String> {
reqwest::Url::parse(&self.uri)
2022-11-02 21:49:37 +01:00
.ok()
2022-11-04 15:50:00 +01:00
.and_then(|uri| uri.domain()
2022-11-03 16:17:04 +01:00
.map(|host| host.to_owned())
)
2022-11-02 21:49:37 +01:00
}
2022-11-02 22:42:43 +01:00
pub fn user_id(&self) -> Option<String> {
let username = self.account.username.to_lowercase();
let host = self.uri_host()?;
Some(format!("{}@{}", username, host))
}
2022-11-02 22:42:43 +01:00
pub fn timestamp(&self) -> Option<DateTime<FixedOffset>> {
DateTime::parse_from_rfc3339(&self.created_at)
.ok()
}
2022-11-08 00:43:46 +01:00
/// clip "en-us" to "en"
pub fn lang(&self) -> Option<String> {
let language = match &self.language {
Some(language) => language,
None => return None,
};
if language.len() < 2 {
None
} else if language.len() == 2 {
Some(language.to_lowercase())
} else {
Some(language[..2].to_lowercase())
}
}
pub fn tags_set(&self) -> HashMap<String, HashSet<String>> {
let mut result: HashMap<String, HashSet<String>> = HashMap::with_capacity(self.tags.len());
for tag in &self.tags {
let name = tag.name.to_lowercase();
if name.contains(char::is_whitespace) {
continue;
}
match result.entry(name) {
std::collections::hash_map::Entry::Vacant(entry) => {
let mut r = HashSet::new();
r.insert(tag.name.clone());
entry.insert(r);
}
std::collections::hash_map::Entry::Occupied(mut entry) => {
entry.get_mut().insert(tag.name.clone());
}
}
}
result
}
2022-11-02 21:12:16 +01:00
}
2022-12-02 23:05:35 +01:00
/// The retained original encoding of a post, held by `EncodablePost`.
#[derive(Debug)]
enum EncodedPost {
    /// The post as a parsed JSON value.
    Value(serde_json::Value),
    /// The post as raw bytes.
    Bytes(Vec<u8>),
    /// Placeholder left behind once `EncodablePost::encode` has taken
    /// the payload.
    Stolen,
}
2022-12-02 22:02:37 +01:00
/// Wraps a `Post` along with a serializable form that is most close
/// to the original incoming data
#[derive(Debug)]
pub struct EncodablePost {
2022-12-02 23:05:35 +01:00
post: Post,
encoded: EncodedPost,
2022-12-02 22:02:37 +01:00
}
/// Lets an `EncodablePost` be used wherever a `&Post` is expected.
impl Deref for EncodablePost {
    type Target = Post;

    /// Borrows the wrapped, decoded post.
    fn deref(&self) -> &Post {
        &self.post
    }
}
impl EncodablePost {
2022-12-02 23:05:35 +01:00
pub fn from_value(value: serde_json::Value) -> Result<Self, serde_json::Error> {
let post = serde_json::from_value(value.clone())?;
Ok(EncodablePost {
post,
encoded: EncodedPost::Value(value),
})
}
pub fn from_bytes(bytes: Vec<u8>) -> Result<Self, serde_json::Error> {
let post = serde_json::from_slice(&bytes)?;
Ok(EncodablePost {
post,
encoded: EncodedPost::Bytes(bytes),
})
}
2022-12-02 22:02:37 +01:00
pub fn encode(&mut self) -> Result<Vec<u8>, serde_json::Error> {
2022-12-02 23:05:35 +01:00
use std::mem::replace;
let encoded = replace(&mut self.encoded, EncodedPost::Stolen);
match encoded {
EncodedPost::Value(value) =>
serde_json::to_vec(&value),
EncodedPost::Bytes(bytes) =>
Ok(bytes),
EncodedPost::Stolen =>
panic!("EncodedPost::Stolen"),
2022-12-02 22:02:37 +01:00
}
}
}
2022-11-02 21:12:16 +01:00
#[derive(Debug)]
pub struct Feed {
2022-12-02 22:02:37 +01:00
pub posts: Vec<EncodablePost>,
2022-11-02 21:12:16 +01:00
}
impl Feed {
/// Analyze time intervals between posts to estimate when to fetch
/// next
pub fn mean_post_interval(&self) -> Option<Duration> {
let mut timestamps = self.posts.iter()
.filter_map(|post| post.timestamp())
.collect::<Vec<_>>();
timestamps.sort();
if timestamps.len() > 2 {
Some(
((*timestamps.last().unwrap() - timestamps[0]) / (timestamps.len() as i32 - 1)
).to_std().unwrap()
)
} else {
None
}
}
2022-11-02 21:12:16 +01:00
pub async fn fetch(client: &reqwest::Client, url: &str) -> Result<Self, reqwest::Error> {
2022-12-02 22:02:37 +01:00
let body = client.get(url)
2022-11-02 21:12:16 +01:00
.send()
.await?
2022-12-02 22:02:37 +01:00
.bytes()
2022-11-02 21:12:16 +01:00
.await?;
2022-12-02 22:02:37 +01:00
let posts = tokio::task::spawn_blocking(move || {
let values: Vec<serde_json::Value> = serde_json::from_slice(&body)?;
let posts: Vec<EncodablePost> = values.into_iter()
2022-12-02 23:05:35 +01:00
.filter_map(|value| EncodablePost::from_value(value).ok())
2022-12-02 22:02:37 +01:00
.collect();
Ok::<_, serde_json::Error>(posts)
}).await.expect("join blocking")
.unwrap_or_else(|e| {
2022-12-01 01:39:38 +01:00
tracing::error!("{}", e);
2022-12-02 22:02:37 +01:00
vec![]
});
2022-12-01 01:39:38 +01:00
tracing::trace!("{} {} posts", url, posts.len());
2022-11-02 21:12:16 +01:00
Ok(Feed { posts })
}
2022-12-02 22:02:37 +01:00
pub async fn stream(client: &reqwest::Client, url: &str) -> Result<impl Stream<Item = EncodablePost>, String> {
let res = client.get(url)
2022-11-11 21:52:52 +01:00
.timeout(Duration::MAX)
.send()
2022-11-11 21:52:52 +01:00
.await
.map_err(|e| format!("{}", e))?;
if res.status() != 200 {
return Err(format!("HTTP {}", res.status()));
}
let ct = res.headers().get("content-type")
.and_then(|c| c.to_str().ok());
if ct.map_or(true, |ct| ct != "text/event-stream") {
return Err(format!("Invalid Content-Type: {:?}", ct));
}
let src = res.bytes_stream().eventsource()
.filter_map(|result| async {
let result = result.ok()?;
if result.event == "update" {
Some(result)
} else {
None
}
})
.filter_map(|event| async move {
2022-12-02 23:05:35 +01:00
EncodablePost::from_bytes(event.data.into_bytes()).ok()
2022-11-11 21:52:52 +01:00
});
Ok(src)
}
2022-11-02 21:12:16 +01:00
}