use std::mem::replace;

/// Line-unfolding state of the raw byte stream.
#[derive(Clone, Copy, PartialEq, Debug)]
enum ByteState {
    /// Inside a line; bytes are forwarded to the line state machine as-is.
    Char,
    /// A line feed has been seen but not yet forwarded: the next byte decides
    /// whether it ends the line or starts a folded continuation.
    Newline,
}

/// Position of the tokenizer within the current logical (unfolded) line.
#[derive(Clone, Copy, PartialEq, Debug)]
enum LineState {
    /// Collecting the key, up to the first `:` or `;`.
    Key,
    /// Collecting a property name, up to `=` or `:`.
    PropName,
    /// Collecting a property value, up to `:`.
    PropValue,
    /// Collecting the value, up to the end of the line.
    Value,
    /// A backslash was seen inside the value; the next byte is an escape code.
    ValueEscape,
}

/// A lexical element of a `KEY;PROPNAME=PROPVALUE:VALUE` content line.
#[derive(Debug, PartialEq)]
pub enum Token {
    /// Text before the first `:` or `;` of a line.
    Key(String),
    /// Text between `;` and the following `=` (or `:`).
    PropName(String),
    /// Text between `=` and the following `:`.
    PropValue(String),
    /// Text after `:` up to the end of the line, with backslash escapes resolved.
    Value(String),
}

/// Incremental, push-style tokenizer for iCalendar-style content lines.
///
/// Input may be supplied in arbitrary chunks through [`Tokenizer::feed`]; state
/// is kept between calls. Call [`Tokenizer::finish`] after the last chunk so
/// that a line terminated by the final line feed is emitted.
#[derive(Debug)]
pub struct Tokenizer {
    byte_state: ByteState,
    line_state: LineState,
    /// Bytes of the token currently being collected.
    buffer: Vec<u8>,
}

impl Default for Tokenizer {
    fn default() -> Self {
        Self::new()
    }
}

impl Tokenizer {
    /// Creates a tokenizer positioned at the start of a line.
    pub fn new() -> Self {
        Tokenizer {
            byte_state: ByteState::Char,
            line_state: LineState::Key,
            buffer: Vec::new(),
        }
    }

    /// Consumes `input`, invoking `f` for every token that becomes complete.
    ///
    /// Carriage returns are discarded. A line feed followed by a SPACE or HTAB
    /// is a folded continuation (RFC 5545, section 3.1) and is removed together
    /// with that whitespace byte. Because the byte after a line feed is needed
    /// to tell the two cases apart, the `Value` of a line is only emitted once
    /// the following byte has been fed, or [`Tokenizer::finish`] is called.
    ///
    /// Tokens that are not valid UTF-8 are dropped and a message is printed.
    pub fn feed<F>(&mut self, input: &[u8], mut f: F)
    where
        F: FnMut(Token),
    {
        for &byte in input {
            match (self.byte_state, byte) {
                (_, b'\r') => {}
                (ByteState::Char, b'\n') => self.byte_state = ByteState::Newline,
                (ByteState::Char, _) => self.step(byte, &mut f),
                (ByteState::Newline, b' ') | (ByteState::Newline, b'\t') => {
                    // Folded line: swallow both the line feed and the whitespace.
                    self.byte_state = ByteState::Char;
                }
                (ByteState::Newline, _) => {
                    // Real line end: deliver the pending line feed, then this byte.
                    self.byte_state = ByteState::Char;
                    self.step(b'\n', &mut f);
                    self.step(byte, &mut f);
                }
            }
        }
    }

    /// Signals the end of input, delivering a still-pending line feed.
    ///
    /// Without this call the last line of an input that ends in a single line
    /// feed would never produce its `Value` token. A trailing line that has no
    /// line feed at all stays buffered and is not emitted.
    pub fn finish<F>(&mut self, mut f: F)
    where
        F: FnMut(Token),
    {
        if self.byte_state == ByteState::Newline {
            self.byte_state = ByteState::Char;
            self.step(b'\n', &mut f);
        }
    }

    /// Advances the line state machine by one unfolded byte.
    ///
    /// `byte` is never a carriage return; `feed` filters those out.
    ///
    /// NOTE(review): a `;` inside a property value is not treated as the start
    /// of a further property, and `\n` inside a property name/value is collected
    /// like any other byte — confirm whether that is intended.
    fn step<F>(&mut self, byte: u8, f: &mut F)
    where
        F: FnMut(Token),
    {
        match (self.line_state, byte) {
            (LineState::Key, b':') => {
                self.emit(Token::Key, f);
                self.line_state = LineState::Value;
            }
            (LineState::Key, b';') => {
                self.emit(Token::Key, f);
                self.line_state = LineState::PropName;
            }
            (LineState::Key, b'\n') => {
                if !self.buffer.is_empty() {
                    println!("Key without value: {:?}", self.buffer);
                    self.buffer.clear();
                }
                self.line_state = LineState::Key;
            }
            (LineState::PropName, b'=') => {
                self.emit(Token::PropName, f);
                self.line_state = LineState::PropValue;
            }
            (LineState::PropName, b':') => {
                self.emit(Token::PropName, f);
                self.line_state = LineState::Value;
            }
            (LineState::PropValue, b':') => {
                self.emit(Token::PropValue, f);
                self.line_state = LineState::Value;
            }
            (LineState::Value, b'\n') => {
                self.emit(Token::Value, f);
                self.line_state = LineState::Key;
            }
            (LineState::Value, b'\\') => {
                self.line_state = LineState::ValueEscape;
            }
            // RFC 5545 (section 3.3.11) allows both `\n` and `\N` for a newline.
            (LineState::ValueEscape, b'n') | (LineState::ValueEscape, b'N') => {
                self.buffer.push(b'\n');
                self.line_state = LineState::Value;
            }
            (LineState::ValueEscape, b'r') => {
                self.buffer.push(b'\r');
                self.line_state = LineState::Value;
            }
            // Any other escaped byte (`\\`, `\;`, `\,`, ...) stands for itself.
            (LineState::ValueEscape, _) => {
                self.buffer.push(byte);
                self.line_state = LineState::Value;
            }
            (_, _) => self.buffer.push(byte),
        }
    }

    /// Takes the collected bytes and passes them to `f` wrapped by `make`.
    ///
    /// The buffer is emptied in either case; invalid UTF-8 is reported on
    /// stdout and produces no token.
    fn emit<F>(&mut self, make: fn(String) -> Token, f: &mut F)
    where
        F: FnMut(Token),
    {
        let bytes = replace(&mut self.buffer, Vec::new());
        match String::from_utf8(bytes) {
            Ok(text) => f(make(text)),
            Err(e) => println!("UTF8 error: {:?}", e),
        }
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    fn tokenize_prop() {
        let mut t = Tokenizer::new();
        let mut tokens = vec![];
        t.feed(
            b"DTSTART;TZID=Europe/Berlin:20191121T150000\n\n",
            |token| tokens.push(token),
        );
        assert_eq!(
            tokens,
            vec![
                Token::Key("DTSTART".to_owned()),
                Token::PropName("TZID".to_owned()),
                Token::PropValue("Europe/Berlin".to_owned()),
                Token::Value("20191121T150000".to_owned()),
            ]
        );
    }

    #[test]
    fn tokenize_event() {
        let mut t = Tokenizer::new();
        let mut tokens = vec![];
        t.feed(
            b"BEGIN:VEVENT\nSUMMARY:Test event\nDTSTART:19700101\nEND:VEVENT\n\n",
            |token| tokens.push(token),
        );
        assert_eq!(
            tokens,
            vec![
                Token::Key("BEGIN".to_owned()),
                Token::Value("VEVENT".to_owned()),
                Token::Key("SUMMARY".to_owned()),
                Token::Value("Test event".to_owned()),
                Token::Key("DTSTART".to_owned()),
                Token::Value("19700101".to_owned()),
                Token::Key("END".to_owned()),
                Token::Value("VEVENT".to_owned()),
            ]
        );
    }

    #[test]
    fn tokenize_multiline() {
        let mut t = Tokenizer::new();
        let mut tokens = vec![];
        t.feed(b"SUMMARY:Hello\n World\n\n", |token| tokens.push(token));
        assert_eq!(
            tokens,
            vec![
                Token::Key("SUMMARY".to_owned()),
                Token::Value("HelloWorld".to_owned()),
            ]
        );
    }

    #[test]
    fn tokenize_finish() {
        let mut t = Tokenizer::new();
        let mut tokens = vec![];
        t.feed(b"END:VCALENDAR\r\n", |token| tokens.push(token));
        t.finish(|token| tokens.push(token));
        assert_eq!(
            tokens,
            vec![
                Token::Key("END".to_owned()),
                Token::Value("VCALENDAR".to_owned()),
            ]
        );
    }
}