From 9f00f50e520bc8155db52b8c4977a429993a754c Mon Sep 17 00:00:00 2001 From: Ryan Kavanagh Date: Sat, 30 May 2020 14:01:16 -0400 Subject: Initial import of a text/gemini parser --- parse_gemini.ml | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 parse_gemini.ml (limited to 'parse_gemini.ml') diff --git a/parse_gemini.ml b/parse_gemini.ml new file mode 100644 index 0000000..2246a90 --- /dev/null +++ b/parse_gemini.ml @@ -0,0 +1,105 @@ +(* Copyright (C) 2020 Ryan Kavanagh *) +(* Implements a parser for the text/gemini mime-type specified on + https://gemini.circumlunar.space/docs/spec-spec.txt . *) + +open Angstrom + +type gemini_line = + | H1 of string (* # header *) + | H2 of string (* ## header *) + | H3 of string (* ### header *) + | Link of string * (string option) (* link with url and optional name *) + | Pre of string (* preformatted text line *) + | Text of string (* plain text line *) + | Ul of string (* unordered list item *) + +type gemini = gemini_line list + +let is_space = function + | ' ' | '\t' -> true + | _ -> false + +let is_cr = function + | '\r' -> true + | _ -> false + +let skip_spaces = skip_while is_space + +let take_till_cr = take_till is_cr <* string "\r\n" + +let header = + lift (fun x -> H3 x) (string "###" *> skip_spaces *> take_till_cr) + <|> lift (fun x -> H2 x) (string "##" *> skip_spaces *> take_till_cr) + <|> lift (fun x -> H1 x) (string "#" *> skip_spaces *> take_till_cr) + "gemini_header" + +let link = + let end_of_url b = is_space b || is_cr b in + let to_link s = function + | "" -> Link (s, None) + | n -> Link (s, Some n) + in + lift2 to_link + (string "=>" *> skip_spaces *> take_till end_of_url) + (* the optional name, will be "" if there is none *) + (skip_spaces *> take_till_cr) + "gemini_link" + +(* Makes sure a line is actually a plain text line *) +let text = + let take_line = lift (fun s -> Text s) take_till_cr in + peek_char >>= function + | Some '#' -> fail "text line cannot start with #" + | Some '*' -> fail "text line cannot start with *" + | Some '`' -> (peek_string 3 >>= function + | "```" -> fail "text line cannot start with ```" + | _ -> take_line) + | Some '=' -> (peek_char >>= function + | Some '>' -> fail "text line cannot start with =>" + | _ -> take_line) + | _ -> take_line + +(* The gemini spec says that pre blocks are demarcated by lines that + start with "```". It says nothing about what follows on these + lines. *) +let pre_divider = string "```" *> take_till_cr *> return None + +let pre_line = peek_string 3 + >>= function + | "```" -> fail "preformatted line cannot start with ```" + | _ -> lift (fun s -> Pre s) take_till_cr + +let pre_block = pre_divider *> many pre_line <* pre_divider + +let list_item = lift (fun x -> Ul x) (string "*" *> skip_spaces *> take_till_cr) + +let gemini : gemini Angstrom.t = + (* Because many tries each in order, we know that we will have a + plain text line whenever we get to the text parser. As a result, + we don't need to recheck that it's actually a text line and not + some special line. *) + let plain_text = lift (fun s -> Text s) take_till_cr + in + fix (fun gemini -> + lift2 List.append pre_block gemini + <|> lift2 List.cons (header <|> link <|> list_item <|> plain_text) gemini + <|> return []) + "gemini" + +let gemini_line_to_str = function + | H1 s -> "# " ^ s + | H2 s -> "## " ^ s + | H3 s -> "### " ^ s + | Link (s, Some n) -> "=> " ^ s ^ " " ^ n + | Link (s, None) -> "=> " ^ s + | Pre s -> "P " ^ s + | Text s -> "T " ^ s + | Ul s -> "* " ^ s + +let gemini_to_str l = + List.fold_right (fun s -> fun r -> (gemini_line_to_str s) ^ "\n" ^ r) l "" + +let test_parse p str = + match parse_string ~consume:All (p >>= return) str with + | Ok x -> gemini_to_str x + | _ -> "" -- cgit v1.2.3