/// This module implements a simple text/gemini parser based on the description /// here: https://gemini.circumlunar.space/docs/specification.html use std::io::{self, Write}; /// Build a gemini document up from a series of nodes. #[derive(Default)] pub struct Builder { nodes: Vec, } impl Builder { pub fn new() -> Builder { Builder::default() } pub fn text>(mut self, data: T) -> Builder { self.nodes.push(Node::Text(data.into())); self } pub fn link>(mut self, to: T, name: Option) -> Builder { self.nodes.push(Node::Link { to: to.into(), name: name, }); self } pub fn preformatted(mut self, alt_text: T, data: T) -> Builder where A: Into, T: Into, { self.nodes.push(Node::Preformatted { alt: alt_text.into(), body: data.into() }); self } pub fn heading>(mut self, level: u8, body: T) -> Builder { self.nodes.push(Node::Heading { level: level, body: body.into(), }); self } pub fn list_item>(mut self, item: T) -> Builder { self.nodes.push(Node::ListItem(item.into())); self } pub fn quote>(mut self, body: T) -> Builder { self.nodes.push(Node::Quote(body.into())); self } pub fn build(self) -> Vec { self.nodes } } impl ToString for Builder { /// Render a document to a string /// /// This produces a text/gemini compliant text document, represented as a string fn to_string(&self) -> String { let len: usize = self.nodes.iter().map(Node::estimate_len).sum(); // sum up node lengths let mut bytes = Vec::with_capacity(len + self.nodes.len()); // add in inter-node newlines render(self, &mut bytes).unwrap(); // Writing to a string shouldn't produce errors unsafe { // This is safe because bytes is composed of Strings. We could have this as // pure safe code by replicating the `render()` method and switching it to use // a fmt::Write (or even `String::push()`)instead of a io::Write, but this has // the same effect, with much DRYer code. String::from_utf8_unchecked(bytes) } } } impl AsRef<[Node]> for Builder { /// Get a reference to the internal node list of this builder fn as_ref(&self) -> &[Node] { self.nodes.as_ref() } } impl AsMut<[Node]> for Builder { /// Get a mutable reference to the internal node list of this builder fn as_mut(&mut self) -> &mut [Node] { self.nodes.as_mut() } } impl From for Vec { /// Convert into a collection of [`Node`]s. /// /// Equivilent to calling [`Builder::build()`] fn from(builder: Builder) -> Self { builder.build() } } /// Render a set of nodes as a document to a writer. pub fn render(nodes: impl AsRef<[Node]>, out: &mut impl Write) -> io::Result<()> { use Node::*; for node in nodes.as_ref() { match node { Text(body) => { let special_prefixes = ["=>", "```", "#", "*", ">"]; if special_prefixes.iter().any(|prefix| body.starts_with(prefix)) { write!(out, " ")?; } write!(out, "{}\n", body)? }, Link { to, name } => match name { Some(name) => write!(out, "=> {} {}\n", to, name)?, None => write!(out, "=> {}\n", to)?, }, Preformatted { alt, body } => write!(out, "```{}\n{}\n```\n", alt, body)?, Heading { level, body } => write!(out, "{} {}\n", "#".repeat(*level as usize), body)?, ListItem(body) => write!(out, "* {}\n", body)?, Quote(body) => write!(out, "> {}\n", body)?, }; } Ok(()) } /// Individual nodes of the document. Each node correlates to a line in the file. #[derive(Debug, PartialEq, Eq, Clone)] pub enum Node { /// Text lines are the most fundamental line type - any line which does not /// match the definition of another line type defined below defaults to /// being a text line. The majority of lines in a typical text/gemini document will be text lines. Text(String), /// Lines beginning with the two characters "=>" are link lines, which have the following syntax: /// /// ```gemini /// =>[][] /// ``` /// /// where: /// /// * `` is any non-zero number of consecutive spaces or tabs /// * Square brackets indicate that the enclosed content is optional. /// * `` is a URL, which may be absolute or relative. If the URL /// does not include a scheme, a scheme of `gemini://` is implied. Link { to: String, name: Option }, /// Any line whose first three characters are "```" (i.e. three consecutive /// back ticks with no leading whitespace) are preformatted toggle lines. /// These lines should NOT be included in the rendered output shown to the /// user. Instead, these lines toggle the parser between preformatted mode /// being "on" or "off". Preformatted mode should be "off" at the beginning /// of a document. The current status of preformatted mode is the only /// internal state a parser is required to maintain. When preformatted mode /// is "on", the usual rules for identifying line types are suspended, and /// all lines should be identified as preformatted text lines (see 5.4.4). /// /// Preformatted text lines should be presented to the user in a "neutral", /// monowidth font without any alteration to whitespace or stylistic /// enhancements. Graphical clients should use scrolling mechanisms to present /// preformatted text lines which are longer than the client viewport, in /// preference to wrapping. In displaying preformatted text lines, clients /// should keep in mind applications like ASCII art and computer source /// code: in particular, source code in languages with significant whitespace /// (e.g. Python) should be able to be copied and pasted from the client into /// a file and interpreted/compiled without any problems arising from the /// client's manner of displaying them. /// /// The first preformatted toggle of a document is often followed by a short /// string, which acts as alt-text for the preformatted block. This is also /// often used to denote the language of code in a block of text. For example, /// a block starting with the text `\`\`\`rust` may be interpreted as rust /// code, and a block starting with `\`\`\` An ascii art owl` would be /// described aptly to visually impaired users using a screen reader. The alt /// text may be separated from the toggle by whitespace. `gemtext` currently /// renders alt text without this separation. /// /// To create a preformatted block with no alt text, simply pass a zero-length /// string as alt text. Preformatted { alt: String, body: String }, /// Lines beginning with "#" are heading lines. Heading lines consist of one, /// two or three consecutive "#" characters, followed by optional whitespace, /// followed by heading text. The number of # characters indicates the "level" /// of header; #, ## and ### can be thought of as analogous to `

`, `

` /// and `

` in HTML. /// /// Heading text should be presented to the user, and clients MAY use special /// formatting, e.g. a larger or bold font, to indicate its status as a header /// (simple clients may simply print the line, including its leading #s, /// without any styling at all). However, the main motivation for the /// definition of heading lines is not stylistic but to provide a /// machine-readable representation of the internal structure of the document. /// Advanced clients can use this information to, e.g. display an automatically /// generated and hierarchically formatted "table of contents" for a long /// document in a side-pane, allowing users to easily jump to specific sections /// without excessive scrolling. CMS-style tools automatically generating menus /// or Atom/RSS feeds for a directory of text/gemini files can use first /// heading in the file as a human-friendly title. Heading { level: u8, body: String }, /// Lines beginning with "* " are unordered list items. This line type exists /// purely for stylistic reasons. The * may be replaced in advanced clients by /// a bullet symbol. Any text after the "* " should be presented to the user as /// if it were a text line, i.e. wrapped to fit the viewport and formatted /// "nicely". Advanced clients can take the space of the bullet symbol into /// account when wrapping long list items to ensure that all lines of text /// corresponding to the item are offset an equal distance from the left of the screen. ListItem(String), /// Lines beginning with ">" are quote lines. This line type exists so that /// advanced clients may use distinct styling to convey to readers the important /// semantic information that certain text is being quoted from an external /// source. For example, when wrapping long lines to the the viewport, each /// resultant line may have a ">" symbol placed at the front. Quote(String), } impl Node { pub fn blank() -> Node { Node::Text("".to_string()) } /// Cheaply estimate the length of this node /// /// This measures length in bytes, *not characters*. So if the user includes /// non-ascii characters, a single one of these characters may add several bytes to /// the length, despite only displaying as one character. /// /// This does include any newlines, but not any trailing newlines. For example, a /// preformatted text block containing a single line reading "trans rights! 🏳️‍⚧️" /// would have a length of 30: 3 backticks, a newline, the text (including 16 bytes /// for the trans flag), another newline, and another 3 backticks. /// /// ``` /// # use gemtext::Node; /// let simple_text = Node::Text(String::from("Henlo worl")); /// let linky_link = Node::Link { to: "gemini://cetacean.club/maj/".to_string(), name: Some("Maj".to_string()) }; /// let human_rights = Node::Preformatted { /// alt: "".to_string(), /// body: "trans rights! 🏳️‍⚧️".to_string(), /// }; /// /// assert_eq!( /// simple_text.estimate_len(), /// "Henlo worl".as_bytes().len() /// ); /// assert_eq!( /// linky_link.estimate_len(), /// "=> gemini://cetacean.club/maj/ Maj".as_bytes().len() /// ); /// assert_eq!( /// human_rights.estimate_len(), /// "```\ntrans rights! 🏳️‍⚧️\n```".as_bytes().len() /// ); /// ``` pub fn estimate_len(&self) -> usize { match self { Self::Text(text) => text.len(), Self::Link { to, name } => 3 + to.as_bytes().len() + name.as_ref().map(|n| n.as_bytes().len() + 1).unwrap_or(0), Self::Preformatted { alt, body } => alt.as_bytes().len() + body.as_bytes().len() + 8, Self::Heading { level, body } => *level as usize + 1 + body.as_bytes().len(), Self::ListItem(item) | Self::Quote(item)=> 2 + item.as_bytes().len(), } } } pub fn parse(doc: &str) -> Vec { let mut result: Vec = vec![]; let mut collect_preformatted: bool = false; let mut preformatted_buffer: Vec = vec![]; let mut alt = ""; for line in doc.lines() { if let Some(trailing) = line.strip_prefix("```") { collect_preformatted = !collect_preformatted; if !collect_preformatted { result.push(Node::Preformatted { alt: alt.to_string(), body: String::from_utf8(preformatted_buffer) .unwrap() .trim_end() .to_string(), }); preformatted_buffer = vec![]; } else { alt = trailing.trim(); } continue; } if collect_preformatted && line != "```" { write!(preformatted_buffer, "{}\n", line).unwrap(); continue; } // Quotes if line.starts_with(">") { result.push(Node::Quote(line[1..].trim().to_string())); continue; } // List items if line.starts_with("*") { result.push(Node::ListItem(line[1..].trim().to_string())); continue; } // Headings if line.starts_with("###") { result.push(Node::Heading { level: 3, body: line[3..].trim().to_string(), }); continue; } if line.starts_with("##") { result.push(Node::Heading { level: 2, body: line[2..].trim().to_string(), }); continue; } if line.starts_with("#") { result.push(Node::Heading { level: 1, body: line[1..].trim().to_string(), }); continue; } // Links if line.starts_with("=>") { let sp = line[2..].split_ascii_whitespace().collect::>(); match sp.len() { 1 => result.push(Node::Link { to: sp[0].trim().to_string(), name: None, }), _ => result.push(Node::Link { to: sp[0].trim().to_string(), name: Some(sp[1..].join(" ").trim().to_string()), }), } continue; } result.push(Node::Text(line.to_string())); } result } #[cfg(test)] mod tests { use super::*; #[test] fn basic() { let _ = pretty_env_logger::try_init(); let msg = include_str!("../../majc/src/help.gmi"); let doc = super::parse(msg); assert_ne!(doc.len(), 0); } #[test] fn quote() { let _ = pretty_env_logger::try_init(); let msg = ">hi there"; let expected: Vec = vec![Node::Quote("hi there".to_string())]; assert_eq!(expected, parse(msg)); } #[test] fn list() { let _ = pretty_env_logger::try_init(); let msg = "*hi there"; let expected: Vec = vec![Node::ListItem("hi there".to_string())]; assert_eq!(expected, parse(msg)); } #[test] fn preformatted() { let _ = pretty_env_logger::try_init(); let msg = "```hi there\n\ obi-wan kenobi\n\ ```\n\ \n\ Test\n"; let expected: Vec = vec![ Node::Preformatted{ alt: "hi there".to_string(), body: "obi-wan kenobi".to_string() }, Node::Text(String::new()), Node::Text("Test".to_string()), ]; assert_eq!(expected, parse(msg)); } #[test] fn header() { let _ = pretty_env_logger::try_init(); let msg = "#hi\n##there\n### my friends"; let expected: Vec = vec![ Node::Heading { level: 1, body: "hi".to_string(), }, Node::Heading { level: 2, body: "there".to_string(), }, Node::Heading { level: 3, body: "my friends".to_string(), }, ]; assert_eq!(expected, parse(msg)); } #[test] fn link() { let _ = pretty_env_logger::try_init(); let msg = "=>/\n=> / Go home"; let expected: Vec = vec![ Node::Link { to: "/".to_string(), name: None, }, Node::Link { to: "/".to_string(), name: Some("Go home".to_string()), }, ]; assert_eq!(expected, parse(msg)); } #[test] fn ambiguous_preformatted() { let _ = pretty_env_logger::try_init(); let msg = include_str!("../../testdata/ambig_preformatted.gmi"); let expected: Vec = vec![ Node::Preformatted { alt: "foo".to_string(), body: "FOO".to_string() }, Node::Text("Foo bar".to_string()), ]; assert_eq!(expected, parse(msg)); } #[test] fn ambiguous_text() { let _ = pretty_env_logger::try_init(); let original = Node::Text("#1 World's Best Coder".to_string()); let expected = " #1 World's Best Coder\n"; let mut rendered: Vec = vec![]; render(vec![original], &mut rendered).unwrap(); let rendered = String::from_utf8(rendered).unwrap(); assert_eq!(expected, rendered) } }