// Mirror of https://gitlab.com/xmpp-rs/xmpp-rs.git
// (synced 2024-06-12 03:04:03 +02:00; 497 lines, 14 KiB, Rust)

//! Parsed XML token

use nom::{
    branch::alt,
    bytes::streaming::{tag, take_while1},
    character::{
        is_space,
        streaming::{char, digit1, one_of, space0, space1},
    },
    combinator::{not, peek, value},
    multi::many0,
    number::streaming::hex_u32,
    IResult,
};
use std::borrow::Cow;

/// Element or attribute name, split into an optional prefix and a local part
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
pub struct LocalName {
    /// Element/attribute prefix: the text before the first `:`, if there is one
    pub prefix: Option<String>,
    /// Element/attribute name: the text after the first `:`, or the whole
    /// input when it contains no `:`
    pub name: String,
}

impl From<&str> for LocalName {
    /// Split `s` at its first `:` into prefix and name.
    ///
    /// Only the first colon is significant, so `"a:b:c"` yields prefix `"a"`
    /// and name `"b:c"`. Input without a colon yields `prefix: None`.
    fn from(s: &str) -> Self {
        match s.split_once(':') {
            Some((prefix, name)) => LocalName {
                prefix: Some(prefix.to_owned()),
                name: name.to_owned(),
            },
            None => LocalName {
                prefix: None,
                name: s.to_owned(),
            },
        }
    }
}

/// Name-value pair of an element's attribute
|
|
#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
|
|
pub struct Attribute {
|
|
/// Attribute name
|
|
pub name: LocalName,
|
|
/// Attribute value
|
|
pub value: String,
|
|
}
|
|
|
|
/// Parsed XML token
|
|
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
|
pub enum Token {
|
|
/// XML declaration `<?xml version="1.0"?>`
|
|
XmlDecl {
|
|
/// List of attributes
|
|
attrs: Vec<Attribute>,
|
|
},
|
|
/// XML element opening tag
|
|
StartTag {
|
|
/// Element name
|
|
name: LocalName,
|
|
/// List of attributes
|
|
attrs: Vec<Attribute>,
|
|
/// Is this tag self-closing (`/>`)?
|
|
self_closing: bool,
|
|
},
|
|
/// XML element closing tag
|
|
EndTag {
|
|
/// Element name
|
|
name: LocalName,
|
|
},
|
|
/// Child text
|
|
Text(String),
|
|
}
|
|
|
|
impl Token {
|
|
/// Parse one token
|
|
pub fn parse(s: &[u8]) -> IResult<&[u8], Token> {
|
|
alt((Self::parse_tag, |s| {
|
|
let (s, _) = not(peek(char('<')))(s)?;
|
|
let (s, text) = Self::parse_text('<', s)?;
|
|
Ok((s, Token::Text(text.into_owned())))
|
|
}))(s)
|
|
}
|
|
|
|
fn parse_tag(s: &[u8]) -> IResult<&[u8], Token> {
|
|
let (s, _) = tag("<")(s)?;
|
|
alt((
|
|
|s| -> IResult<&[u8], Token> {
|
|
// CDATA
|
|
let (s, _) = tag("![CDATA[")(s)?;
|
|
let mut end = None;
|
|
for i in 0..s.len() - 2 {
|
|
if &s[i..i + 3] == b"]]>" {
|
|
end = Some(i);
|
|
break;
|
|
}
|
|
}
|
|
if let Some(end) = end {
|
|
let text = Self::str_from_utf8(&s[..end])?;
|
|
Ok((&s[end + 3..], Token::Text(text.to_string())))
|
|
} else {
|
|
Err(nom::Err::Incomplete(nom::Needed::Unknown))
|
|
}
|
|
},
|
|
|s| {
|
|
// XmlDecl
|
|
let (s, _) = tag("?xml")(s)?;
|
|
let (s, _) = space1(s)?;
|
|
|
|
let (s, attrs) = many0(|s| {
|
|
let (s, (name, value)) = Self::parse_attr(s)?;
|
|
let (s, _) = space0(s)?;
|
|
Ok((s, (name, value)))
|
|
})(s)?;
|
|
|
|
let (s, _) = space0(s)?;
|
|
let (s, _) = tag("?>")(s)?;
|
|
Ok((
|
|
s,
|
|
Token::XmlDecl {
|
|
attrs: attrs
|
|
.into_iter()
|
|
.map(|(name, value)| Attribute {
|
|
name: name.into(),
|
|
value: value.into_owned(),
|
|
})
|
|
.collect(),
|
|
},
|
|
))
|
|
},
|
|
|s| {
|
|
// EndTag
|
|
let (s, _) = tag("/")(s)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, name) = take_while1(|b| !(is_space(b) || b == b'>'))(s)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, _) = tag(">")(s)?;
|
|
let name = Self::str_from_utf8(name)?;
|
|
Ok((s, Token::EndTag { name: name.into() }))
|
|
},
|
|
|s| {
|
|
// StartTag
|
|
let (s, _) = space0(s)?;
|
|
let (s, name) = take_while1(|b| !(is_space(b) || b == b'>' || b == b'/'))(s)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, attrs) = many0(|s| {
|
|
let (s, (name, value)) = Self::parse_attr(s)?;
|
|
let (s, _) = space0(s)?;
|
|
Ok((s, (name, value)))
|
|
})(s)?;
|
|
|
|
let (s, self_closing) = alt((
|
|
|s| {
|
|
let (s, _) = tag("/")(s)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, _) = tag(">")(s)?;
|
|
Ok((s, true))
|
|
},
|
|
|s| {
|
|
let (s, _) = tag(">")(s)?;
|
|
Ok((s, false))
|
|
},
|
|
))(s)?;
|
|
|
|
Ok((
|
|
s,
|
|
Token::StartTag {
|
|
name: Self::str_from_utf8(name)?.into(),
|
|
attrs: attrs
|
|
.into_iter()
|
|
.map(|(name, value)| Attribute {
|
|
name: name.into(),
|
|
value: value.into_owned(),
|
|
})
|
|
.collect(),
|
|
self_closing,
|
|
},
|
|
))
|
|
},
|
|
))(s)
|
|
}
|
|
|
|
fn parse_attr(s: &[u8]) -> IResult<&[u8], (&str, Cow<str>)> {
|
|
let (s, name) = take_while1(|b| !(is_space(b) || b == b'=' || b == b'/' || b == b'>'))(s)?;
|
|
let name = Self::str_from_utf8(name)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, _) = tag("=")(s)?;
|
|
let (s, _) = space0(s)?;
|
|
let (s, delim) = one_of("'\"")(s)?;
|
|
let (s, value) = Self::parse_text(delim, s)?;
|
|
let value = Self::normalize_attribute_value(value);
|
|
let (s, _) = char(delim)(s)?;
|
|
Ok((s, (name, value)))
|
|
}
|
|
|
|
fn parse_text(until: char, s: &[u8]) -> IResult<&[u8], Cow<str>> {
|
|
let (s, results) = many0(alt((
|
|
|s| {
|
|
let (s, _) = tag("&#")(s)?;
|
|
let (s, num) = digit1(s)?;
|
|
let (s, _) = char(';')(s)?;
|
|
let num: u32 = Self::str_from_utf8(num)?.parse().map_err(|_| {
|
|
nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail))
|
|
})?;
|
|
if let Some(c) = std::char::from_u32(num) {
|
|
Ok((s, Cow::from(format!("{}", c))))
|
|
} else {
|
|
Ok((s, Cow::from(format!(""))))
|
|
}
|
|
},
|
|
|s| {
|
|
let (s, _) = tag("&#x")(s)?;
|
|
let (s, num) = hex_u32(s)?;
|
|
let (s, _) = char(';')(s)?;
|
|
if let Some(c) = std::char::from_u32(num) {
|
|
Ok((s, Cow::from(format!("{}", c))))
|
|
} else {
|
|
Ok((s, Cow::from(format!(""))))
|
|
}
|
|
},
|
|
|s| {
|
|
let (s, _) = char('&')(s)?;
|
|
let (s, c) = alt((
|
|
value('&', tag("amp")),
|
|
value('<', tag("lt")),
|
|
value('>', tag("gt")),
|
|
value('"', tag("quot")),
|
|
value('\'', tag("apos")),
|
|
))(s)?;
|
|
let (s, _) = char(';')(s)?;
|
|
Ok((s, Cow::from(format!("{}", c))))
|
|
},
|
|
|s| {
|
|
let (s, _) = not(peek(char(until)))(s)?;
|
|
let (s, text) =
|
|
take_while1(|b| b != until as u8 && b != b'&' && b != b'<' && b != b'>')(s)?;
|
|
let text = Self::str_from_utf8(text)?;
|
|
let text = Self::normalize_newlines(text);
|
|
Ok((s, text))
|
|
},
|
|
)))(s)?;
|
|
|
|
if results.len() == 1 {
|
|
Ok((s, results.into_iter().next().unwrap()))
|
|
} else {
|
|
let result = results.join("");
|
|
Ok((s, Cow::from(result)))
|
|
}
|
|
}
|
|
|
|
fn str_from_utf8(s: &[u8]) -> Result<&str, nom::Err<nom::error::Error<&[u8]>>> {
|
|
std::str::from_utf8(s)
|
|
.map_err(|_| nom::Err::Failure(nom::error::Error::new(s, nom::error::ErrorKind::Fail)))
|
|
}
|
|
|
|
/// https://www.w3.org/TR/2008/REC-xml-20081126/#sec-line-ends
|
|
fn normalize_newlines(s: &str) -> Cow<str> {
|
|
let mut s = Cow::from(s);
|
|
if s.find("\r\n").is_some() {
|
|
s = Cow::from(s.replace("\r\n", "\n"));
|
|
}
|
|
if s.find("\r").is_some() {
|
|
s = Cow::from(s.replace("\r", "\n"));
|
|
}
|
|
s
|
|
}
|
|
|
|
/// https://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize
|
|
///
|
|
/// assumes normalize_newlines() already done
|
|
fn normalize_attribute_value(mut s: Cow<str>) -> Cow<str> {
|
|
if s.find("\t").is_some() || s.find("\n").is_some() {
|
|
s = Cow::from(s.replace(|c| c == '\t' || c == '\n', " "));
|
|
}
|
|
s
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an expected [`Attribute`] from string slices.
    fn attr(name: &str, value: &str) -> Attribute {
        Attribute {
            name: name.into(),
            value: value.to_owned(),
        }
    }

    #[test]
    fn test_text() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foobar".to_string()))),
            Token::parse(b"foobar</x")
        );
    }

    #[test]
    fn test_newlines() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("a\nb\nc\nd".to_string()))),
            Token::parse(b"a\nb\rc\r\nd</x")
        );
    }

    #[test]
    fn test_text_entities() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("\"<foo&bar>'".to_string()))),
            Token::parse(b"&quot;&lt;foo&amp;bar&gt;&apos;</x")
        );
    }

    #[test]
    fn test_text_entities_decimal() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foo\r\n".to_string()))),
            Token::parse(b"foo&#13;&#10;</x")
        );
    }

    #[test]
    fn test_text_entities_hexadecimal() {
        assert_eq!(
            Ok((&b"</x"[..], Token::Text("foo\r\n".to_string()))),
            Token::parse(b"foo&#x0D;&#x0A;</x")
        );
    }

    #[test]
    fn test_cdata() {
        assert_eq!(
            Ok((&b""[..], Token::Text("<a href='>".to_string()))),
            Token::parse(b"<![CDATA[<a href='>]]>")
        );
    }

    #[test]
    fn test_tag() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "foobar".into(),
                    attrs: vec![],
                    self_closing: false,
                }
            )),
            Token::parse(b"<foobar>")
        );
    }

    #[test]
    fn test_attrs() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "a".into(),
                    attrs: vec![attr("a", "2'3"), attr("b", "4\"2"), attr("c", ""),],
                    self_closing: false,
                }
            )),
            Token::parse(b"<a a=\"2'3\" b = '4\"2' c = ''>")
        );
    }

    #[test]
    fn test_attrs_normalized() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "a".into(),
                    // `\r\r` is two line ends, so it normalizes to two spaces.
                    attrs: vec![attr("a", "x y"), attr("b", " "), attr("c", "a  b"),],
                    self_closing: false,
                }
            )),
            Token::parse(b"<a a=\"x\ty\" b = '\r\n' c = 'a\r\rb'>")
        );
    }

    #[test]
    fn test_attrs_entities() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "a".into(),
                    attrs: vec![attr("a", "<3"),],
                    self_closing: false,
                }
            )),
            Token::parse(b"<a a='&lt;3'>")
        );
    }

    #[test]
    fn test_self_closing_tag() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "foobar".into(),
                    attrs: vec![],
                    self_closing: true,
                }
            )),
            Token::parse(b"<foobar/>")
        );
    }

    #[test]
    fn test_end_tag() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::EndTag {
                    name: "foobar".into(),
                }
            )),
            Token::parse(b"</foobar>")
        );
    }

    #[test]
    fn test_element_prefix() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: LocalName {
                        name: "z".to_owned(),
                        prefix: Some("x".to_owned()),
                    },
                    attrs: vec![],
                    self_closing: true,
                }
            )),
            Token::parse(b"<x:z/>")
        );
    }

    #[test]
    fn test_attr_prefix() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::StartTag {
                    name: "a".into(),
                    attrs: vec![Attribute {
                        name: LocalName {
                            name: "abc".to_owned(),
                            prefix: Some("xyz".to_owned()),
                        },
                        value: "".to_owned(),
                    }],
                    self_closing: false,
                }
            )),
            Token::parse(b"<a xyz:abc=''>")
        );
    }

    #[test]
    fn test_xml_decl() {
        assert_eq!(
            Ok((
                &b""[..],
                Token::XmlDecl {
                    attrs: vec![
                        Attribute {
                            name: LocalName {
                                name: "version".to_owned(),
                                prefix: None,
                            },
                            value: "1.0".to_owned(),
                        },
                        Attribute {
                            name: LocalName {
                                name: "encoding".to_owned(),
                                prefix: None,
                            },
                            value: "UTF-8".to_owned(),
                        }
                    ],
                }
            )),
            Token::parse(b"<?xml version='1.0' encoding=\"UTF-8\"?>")
        );
    }

    // TODO:
    // - DOCTYPE
}