Safe Haskell | Safe-Inferred |
---|---|
Language | GHC2021 |
MarkupParse
Description
Synopsis
- data Markup = Markup {
- standard :: Standard
- markupTree :: [Tree Token]
  }
- data Standard
- markup :: Standard -> ByteString -> These [MarkupWarning] Markup
- markup_ :: Standard -> ByteString -> Markup
- data RenderStyle
- markdown :: RenderStyle -> Markup -> ByteString
- normalize :: Markup -> Markup
- wellFormed :: Markup -> [MarkupWarning]
- isWellFormed :: Markup -> Bool
- data MarkupWarning
- type Result a = These [MarkupWarning] a
- resultError :: Result a -> a
- resultEither :: Result a -> Either [MarkupWarning] a
- resultMaybe :: Result a -> Maybe a
- type TagName = ByteString
- name :: Standard -> Parser e ByteString
- selfClosers :: [TagName]
- type AttrName = ByteString
- type AttrValue = ByteString
- data Attr = Attr !AttrName !AttrValue
- attrs :: Standard -> Parser a [Attr]
- data Token
- = StartTag !TagName ![Attr]
- | EmptyElemTag !TagName ![Attr]
- | EndTag !TagName
- | Content !ByteString
- | Comment !ByteString
- | Decl !ByteString
- | Doctype !ByteString
- tokenize :: Standard -> ByteString -> These [MarkupWarning] [Token]
- tokenize_ :: Standard -> ByteString -> [Token]
- token :: Standard -> Parser String Token
- detokenize :: Standard -> Token -> ByteString
- gather :: Standard -> [Token] -> These [MarkupWarning] [Tree Token]
- gather_ :: Standard -> [Token] -> [Tree Token]
- degather :: Markup -> These [MarkupWarning] [Token]
- degather_ :: Markup -> [Token]
- xmlVersionInfo :: Parser e ByteString
- xmlEncodingDecl :: Parser e ByteString
- xmlStandalone :: Parser e ByteString
- xmlVersionNum :: Parser e ByteString
- xmlEncName :: Parser e ByteString
- xmlYesNo :: Parser e ByteString
Documentation
import MarkupParse
import Data.ByteString qualified as B

bs <- B.readFile "other/line.svg"
m = markup_ Xml bs

markdown . markup_ is an approximate round trip from ByteString to Markup and back to ByteString. The underscores mark versions of the main functions that throw an exception on any warnings encountered along the way.
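A minimal sketch of that round trip as a single function; the helper name roundTripFile and the choice of Xml and Compact here are illustrative assumptions, not part of the library's example:

```haskell
import Data.ByteString qualified as B
import MarkupParse

-- Read a file, parse it (markup_ throws an exception on any warnings),
-- and render the Markup back to a ByteString.
roundTripFile :: FilePath -> IO B.ByteString
roundTripFile path = markdown Compact . markup_ Xml <$> B.readFile path
```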
At a lower level, a round trip pipeline might look something like:
:t tokenize Html >=> gather Html >>> fmap (Markup Html >>> normalize) >=> degather >>> fmap (fmap (detokenize Html) >>> mconcat)
ByteString -> These [MarkupWarning] ByteString
From left to right:
- tokenize converts a ByteString to a Token list,
- gather takes the tokens and gathers them into Trees of tokens; this is then wrapped into the Markup data type,
- normalize concatenates content and normalizes attributes,
- degather turns the markup tree back into a token list, and finally
- detokenize turns each token back into a bytestring.
Along the way, the Kleisli fishies and compose-forward usage accumulate any warnings via the These monad instance.
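The same pipeline written out as a compilable function (a sketch; the helper name roundTrip is ours, and the extra parentheses only make the >=> / >>> grouping explicit):

```haskell
import Control.Category ((>>>))
import Control.Monad ((>=>))
import Data.ByteString (ByteString)
import Data.These (These)
import MarkupParse

-- Tokenize, gather into trees, wrap and normalize as Markup,
-- then degather and print the tokens back to a ByteString.
-- Warnings from each stage are accumulated by the These monad.
roundTrip :: ByteString -> These [MarkupWarning] ByteString
roundTrip =
  tokenize Html
    >=> (gather Html >>> fmap (Markup Html >>> normalize))
    >=> (degather >>> fmap (fmap (detokenize Html) >>> mconcat))
```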
Markup

data Markup Source #
>>>
markup Html "<foo class=\"bar\">baz</foo>"
That (Markup {standard = Html, markupTree = [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
Instances
Generic Markup Source # | |
Show Markup Source # | |
NFData Markup Source # | |
Defined in MarkupParse | |
Eq Markup Source # | |
Ord Markup Source # | |
ToExpr Markup Source # | |
Defined in MarkupParse | |
type Rep Markup Source # | |
Defined in MarkupParse type Rep Markup = D1 ('MetaData "Markup" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Markup" 'PrefixI 'True) (S1 ('MetaSel ('Just "standard") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Standard) :*: S1 ('MetaSel ('Just "markupTree") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Tree Token]))) |
data Standard Source #

From a parsing point of view, Html and Xml (and Svg) are close enough that they share a lot of parsing logic, so that parsing and printing just need some tweaking.

The XML parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/.

The HTML parsing was based on a reading of html-parse, but ignores the various '\x00' to '\xfffd' and EOF directives that form part of the HTML standards.
markup :: Standard -> ByteString -> These [MarkupWarning] Markup Source #
Convert bytestrings to Markup
>>>
markup Html "<foo><br></foo><baz"
These [MarkupParser (ParserLeftover "<baz")] (Markup {standard = Html, markupTree = [Node {rootLabel = StartTag "foo" [], subForest = [Node {rootLabel = StartTag "br" [], subForest = []}]}]})
data RenderStyle Source #
Indented 0 puts newlines in between the tags.
Instances
Generic RenderStyle Source # | |
Defined in MarkupParse Associated Types type Rep RenderStyle :: Type -> Type # | |
Show RenderStyle Source # | |
Defined in MarkupParse Methods showsPrec :: Int -> RenderStyle -> ShowS # show :: RenderStyle -> String # showList :: [RenderStyle] -> ShowS # | |
Eq RenderStyle Source # | |
Defined in MarkupParse | |
type Rep RenderStyle Source # | |
Defined in MarkupParse type Rep RenderStyle = D1 ('MetaData "RenderStyle" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Compact" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Indented" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int))) |
markdown :: RenderStyle -> Markup -> ByteString Source #
Convert Markup to bytestrings.
>>>
B.putStr $ markdown (Indented 4) (markup_ Html [i|<foo><br></foo>|])
<foo>
    <br>
</foo>
normalize :: Markup -> Markup Source #
Concatenate sequential content and normalize attributes: unword class values and remove duplicate attributes (taking the last).
>>>
B.putStr $ markdown Compact $ normalize (markup_ Xml [i|<foo class="a" class="b" bar="first" bar="last"/>|])
<foo bar="last" class="a b"/>
wellFormed :: Markup -> [MarkupWarning] Source #
Check for well-formedness and return any warnings encountered.
>>>
wellFormed $ Markup Html [Node (Comment "") [], Node (EndTag "foo") [], Node (EmptyElemTag "foo" []) [Node (Content "bar") []], Node (EmptyElemTag "foo" []) []]
[EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag]
isWellFormed :: Markup -> Bool Source #
Are the trees in the markup well-formed?
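A usage sketch, reusing a tree from the wellFormed example above; it assumes isWellFormed is simply the Boolean view of wellFormed, so the second check is expected (though not shown in the docs) to be True:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import Data.Tree (Tree (Node))
import MarkupParse

-- The comment-only tree triggers an EmptyContent warning, so it is not
-- well-formed; a plain start tag with content should be well-formed.
checks :: (Bool, Bool)
checks =
  ( isWellFormed (Markup Html [Node (Comment "") []])   -- expected False
  , isWellFormed (markup_ Html "<foo>bar</foo>")        -- expected True
  )
```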
Warnings
data MarkupWarning Source #
markup-parse generally tries to continue on parse errors, and returns what can still be parsed, together with any warnings.
Constructors
BadEmptyElemTag | A tag ending with "/>" that is not an element of selfClosers. |
SelfCloserWithChildren | A tag ending with "/>" that has children. Cannot happen in the parsing phase. |
LeafWithChildren | Only a StartTag can have children. |
TagMismatch TagName TagName | An EndTag with a different name to the currently open StartTag. |
UnmatchedEndTag | An EndTag with no corresponding StartTag. |
UnclosedTag | A StartTag with no corresponding EndTag. |
EndTagInTree | An EndTag should never appear in the markup tree. |
EmptyContent | Empty Content, Comment, Decl or Doctype. |
MarkupParser ParserWarning | A warning from the underlying token parser. |
Instances
type Result a = These [MarkupWarning] a Source #
The structure that many of the functions return.
A common computation pipeline takes advantage of the These Monad instance, e.g.
markup s bs = bs & (tokenize s >=> gather s) & second (Markup s)
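For completeness, a sketch of consuming a Result directly via the these package's This/That/These constructors, rather than the result* helpers below (the report helper is illustrative):

```haskell
import Data.These (These (..))
import MarkupParse

-- Report warnings and/or a parsed Markup value.
report :: Result Markup -> IO ()
report (This warnings)    = putStrLn ("no result, warnings: " <> show warnings)
report (That m)           = print m
report (These warnings m) = putStrLn ("warnings: " <> show warnings) >> print m
```

Applied to markup Html "<foo><baz", for example, this would report both the leftover-input warning and the partially parsed tree.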
resultError :: Result a -> a Source #
Convert any warnings to an error
>>>
resultError $ (tokenize Html) "<foo"
*** Exception: MarkupParser (ParserLeftover "<foo") ...
resultEither :: Result a -> Either [MarkupWarning] a Source #
Returns Left on any warnings
>>>
resultEither $ (tokenize Html) "<foo><baz"
Left [MarkupParser (ParserLeftover "<baz")]
resultMaybe :: Result a -> Maybe a Source #
Returns results if any, ignoring warnings.
>>>
resultMaybe $ (tokenize Html) "<foo><baz"
Just [StartTag "foo" []]
Token components
type TagName = ByteString Source #
Name of token
name :: Standard -> Parser e ByteString Source #
Parse a tag name. Each standard is slightly different.
selfClosers :: [TagName] Source #
Html tags that self-close
type AttrName = ByteString Source #
Name of an attribute.
type AttrValue = ByteString Source #
Value of an attribute. "" is equivalent to true with respect to boolean attributes.
data Attr Source #

An attribute of a tag.

In parsing, boolean attributes, which are not required to have a value in HTML, will be assigned a value of "", which is ok. But this will then be rendered:
>>>
detokenize Html <$> tokenize_ Html [i|<input checked>|]
["<input checked=\"\">"]
Instances
Generic Attr Source # | |
Show Attr Source # | |
NFData Attr Source # | |
Defined in MarkupParse | |
Eq Attr Source # | |
Ord Attr Source # | |
ToExpr Attr Source # | |
Defined in MarkupParse | |
type Rep Attr Source # | |
Defined in MarkupParse type Rep Attr = D1 ('MetaData "Attr" "MarkupParse" "markup-parse-0.0.0.2-HnQ2lGo9uJcI0zQ43aTniK" 'False) (C1 ('MetaCons "Attr" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrName) :*: S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrValue))) |
Tokens
data Token Source #

A Markup token.
>>>
runParser_ (many (token Html)) [i|<foo>content</foo>|]
[StartTag "foo" [],Content "content",EndTag "foo"]
>>>
runParser_ (token Xml) [i|<foo/>|]
EmptyElemTag "foo" []
>>>
runParser_ (token Html) "<!-- Comment -->"
Comment " Comment "
>>>
runParser_ (token Xml) [i|<?xml version="1.0" encoding="UTF-8"?>|]
Decl "xml version=\"1.0\" encoding=\"UTF-8\""
>>>
runParser_ (token Html) "<!DOCTYPE html>"
Doctype "DOCTYPE html"
>>>
runParser_ (token Xml) "<!DOCTYPE foo [ declarations ]>"
Doctype "DOCTYPE foo [ declarations ]"
>>>
runParser (token Html) [i|<foo a="a" b="b" c=c check>|]
OK (StartTag "foo" [Attr "a" "a",Attr "b" "b",Attr "c" "c",Attr "check" ""]) ""
>>>
runParser (token Xml) [i|<foo a="a" b="b" c=c check>|]
Fail
Constructors
StartTag !TagName ![Attr] | A start tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag |
EmptyElemTag !TagName ![Attr] | An empty element tag. Optional for XML and kind of not allowed in HTML. |
EndTag !TagName | A closing tag. |
Content !ByteString | The content between tags. |
Comment !ByteString | Contents of a comment. |
Decl !ByteString | Contents of a declaration |
Doctype !ByteString | Contents of a doctype declaration. |
Instances
tokenize :: Standard -> ByteString -> These [MarkupWarning] [Token] Source #
Parse a bytestring into tokens
>>>
tokenize Html [i|<foo>content</foo>|]
That [StartTag "foo" [],Content "content",EndTag "foo"]
token :: Standard -> Parser String Token Source #
A flatparse Token parser.
>>>
runParser (token Html) "<foo>content</foo>"
OK (StartTag "foo" []) "content</foo>"
detokenize :: Standard -> Token -> ByteString Source #
Bytestring representation of a Token.
>>>
detokenize Html (StartTag "foo" [])
"<foo>"
gather :: Standard -> [Token] -> These [MarkupWarning] [Tree Token] Source #
Gather together token trees from a token list, placing child elements in nodes and removing EndTags.
>>>
gather Html =<< tokenize Html "<foo class=\"bar\">baz</foo>"
That [Node {rootLabel = StartTag "foo" [Attr "class" "bar"], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]
degather :: Markup -> These [MarkupWarning] [Token] Source #
Convert a markup into a token list, adding end tags.
>>>
degather =<< markup Html "<foo class=\"bar\">baz</foo>"
That [StartTag "foo" [Attr "class" "bar"],Content "baz",EndTag "foo"]
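A sketch of the gather/degather round trip on well-formed input; it assumes an Eq instance on Token, and the expected equality follows from the tokenize and degather examples above:

```haskell
{-# LANGUAGE OverloadedStrings #-}

import MarkupParse

-- gather removes EndTags and nests children; degather re-adds the EndTags,
-- so a well-formed token list survives the round trip unchanged.
tokensRoundTrip :: Bool
tokensRoundTrip =
  let ts = tokenize_ Html "<foo class=\"bar\">baz</foo>"
   in degather_ (Markup Html (gather_ Html ts)) == ts
```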
XML specific Parsers
xmlVersionInfo :: Parser e ByteString Source #
XML production [24]
xmlEncodingDecl :: Parser e ByteString Source #
XML production [80]
xmlStandalone :: Parser e ByteString Source #
XML production [32]
xmlVersionNum :: Parser e ByteString Source #
XML production [26]
xmlEncName :: Parser e ByteString Source #
XML production [81]
xmlYesNo :: Parser e ByteString Source #
XML yes/no