diff options
-rw-r--r-- | LICENSE | 23 | ||||
-rw-r--r-- | parse_gemini.ml | 105 |
2 files changed, 128 insertions, 0 deletions
@@ -0,0 +1,23 @@ +Copyright (c) 2020 Ryan Kavanagh <rak@rak.ac> +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/parse_gemini.ml b/parse_gemini.ml new file mode 100644 index 0000000..2246a90 --- /dev/null +++ b/parse_gemini.ml @@ -0,0 +1,105 @@ +(* Copyright (C) 2020 Ryan Kavanagh <rak@rak.ac> *) +(* Implements a parser for the text/gemini mime-type specified on + https://gemini.circumlunar.space/docs/spec-spec.txt . *) + +open Angstrom + +type gemini_line = + | H1 of string (* # header *) + | H2 of string (* ## header *) + | H3 of string (* ### header *) + | Link of string * (string option) (* link with url and optional name *) + | Pre of string (* preformatted text line *) + | Text of string (* plain text line *) + | Ul of string (* unordered list item *) + +type gemini = gemini_line list + +let is_space = function + | ' ' | '\t' -> true + | _ -> false + +let is_cr = function + | '\r' -> true + | _ -> false + +let skip_spaces = skip_while is_space + +let take_till_cr = take_till is_cr <* string "\r\n" + +let header = + lift (fun x -> H3 x) (string "###" *> skip_spaces *> take_till_cr) + <|> lift (fun x -> H2 x) (string "##" *> skip_spaces *> take_till_cr) + <|> lift (fun x -> H1 x) (string "#" *> skip_spaces *> take_till_cr) + <?> "gemini_header" + +let link = + let end_of_url b = is_space b || is_cr b in + let to_link s = function + | "" -> Link (s, None) + | n -> Link (s, Some n) + in + lift2 to_link + (string "=>" *> skip_spaces *> take_till end_of_url) + (* the optional name, will be "" if there is none *) + (skip_spaces *> take_till_cr) + <?> "gemini_link" + +(* Makes sure a line is actually a plain text line *) +let text = + let take_line = lift (fun s -> Text s) take_till_cr in + peek_char >>= function + | Some '#' -> fail "text line cannot start with #" + | Some '*' -> fail "text line cannot start with *" + | Some '`' -> (peek_string 3 >>= function + | "```" -> fail "text line cannot start with ```" + | _ -> take_line) + | Some '=' -> (peek_char >>= function + | Some '>' -> fail "text line cannot start with =>" + | _ -> take_line) + | _ -> take_line + +(* The gemini spec says that pre blocks are demarcated by lines that + start with "```". It says nothing about what follows on these + lines. *) +let pre_divider = string "```" *> take_till_cr *> return None + +let pre_line = peek_string 3 + >>= function + | "```" -> fail "preformatted line cannot start with ```" + | _ -> lift (fun s -> Pre s) take_till_cr + +let pre_block = pre_divider *> many pre_line <* pre_divider + +let list_item = lift (fun x -> Ul x) (string "*" *> skip_spaces *> take_till_cr) + +let gemini : gemini Angstrom.t = + (* Because many tries each in order, we know that we will have a + plain text line whenever we get to the text parser. As a result, + we don't need to recheck that it's actually a text line and not + some special line. *) + let plain_text = lift (fun s -> Text s) take_till_cr + in + fix (fun gemini -> + lift2 List.append pre_block gemini + <|> lift2 List.cons (header <|> link <|> list_item <|> plain_text) gemini + <|> return []) + <?> "gemini" + +let gemini_line_to_str = function + | H1 s -> "# " ^ s + | H2 s -> "## " ^ s + | H3 s -> "### " ^ s + | Link (s, Some n) -> "=> " ^ s ^ " " ^ n + | Link (s, None) -> "=> " ^ s + | Pre s -> "P " ^ s + | Text s -> "T " ^ s + | Ul s -> "* " ^ s + +let gemini_to_str l = + List.fold_right (fun s -> fun r -> (gemini_line_to_str s) ^ "\n" ^ r) l "" + +let test_parse p str = + match parse_string ~consume:All (p >>= return) str with + | Ok x -> gemini_to_str x + | _ -> "" |