Safe Haskell | Safe-Inferred |
---|---|
Language | GHC2021 |
MarkupParse
Description
A Markup parser and printer for strict ByteStrings, focused on performance. Markup is a representation of data such as HTML, SVG or XML, but the parsing does not strictly adhere to the published standards.
Synopsis
- newtype Markup = Markup {}
- data Standard
- markup :: Standard -> ByteString -> Warn Markup
- markup_ :: Standard -> ByteString -> Markup
- data RenderStyle
- markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString
- markdown_ :: RenderStyle -> Standard -> Markup -> ByteString
- normalize :: Markup -> Markup
- normContent :: Markup -> Markup
- wellFormed :: Standard -> Markup -> [MarkupWarning]
- isWellFormed :: Standard -> Markup -> Bool
- data MarkupWarning
- type Warn a = These [MarkupWarning] a
- warnError :: Warn a -> a
- warnEither :: Warn a -> Either [MarkupWarning] a
- warnMaybe :: Warn a -> Maybe a
- type Element = Tree Token
- element :: NameTag -> [Attr] -> Markup -> Markup
- element_ :: NameTag -> [Attr] -> Markup
- emptyElem :: NameTag -> [Attr] -> Markup
- elementc :: NameTag -> [Attr] -> ByteString -> Markup
- content :: ByteString -> Markup
- contentRaw :: ByteString -> Markup
- type NameTag = ByteString
- selfClosers :: [NameTag]
- addAttrs :: [Attr] -> Token -> Maybe Token
- doctypeHtml :: Markup
- doctypeXml :: Markup
- type AttrName = ByteString
- type AttrValue = ByteString
- data Attr = Attr {}
- attrsP :: Standard -> Parser a [Attr]
- nameP :: Standard -> Parser e ByteString
- data OpenTagType
- data Token
- = OpenTag !OpenTagType !NameTag ![Attr]
- | EndTag !NameTag
- | Content !ByteString
- | Comment !ByteString
- | Decl !ByteString ![Attr]
- | Doctype !ByteString
- tokenize :: Standard -> ByteString -> Warn [Token]
- tokenize_ :: Standard -> ByteString -> [Token]
- tokenP :: Standard -> Parser e Token
- detokenize :: Standard -> Token -> ByteString
- gather :: Standard -> [Token] -> Warn Markup
- gather_ :: Standard -> [Token] -> Markup
- degather :: Standard -> Markup -> Warn [Token]
- degather_ :: Standard -> Markup -> [Token]
- xmlVersionInfoP :: Parser e ByteString
- xmlEncodingDeclP :: Parser e ByteString
- xmlStandaloneP :: Parser e ByteString
- xmlVersionNumP :: Parser e ByteString
- xmlEncNameP :: Parser e ByteString
- xmlYesNoP :: Parser e ByteString
- utf8ToStr :: ByteString -> String
- strToUtf8 :: String -> ByteString
- escapeChar :: Char -> ByteString
- escape :: ByteString -> ByteString
- data Tree a = Node {}
Usage
import MarkupParse
import Data.ByteString qualified as B

bs <- B.readFile "other/line.svg"
m = markup_ Xml bs
markup_ . markdown_ is an isomorphic round trip from Markup to ByteString and back to Markup (a sketch follows below):

- This is subject to the Markup being wellFormed.
- The reverse round trip, markdown_ . markup_ (from ByteString to Markup and back), is not isomorphic, as parsing forgets whitespace within tags, comments and declarations.
- The underscores mark versions of the main functions that throw an exception on any warnings encountered along the way.
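A minimal sketch of the two round trips, assuming the file from the usage block above and an Eq instance for Markup (the checks are illustrative expectations, not doctests):

import MarkupParse
import Data.ByteString qualified as B

roundTrips :: IO ()
roundTrips = do
  bs <- B.readFile "other/line.svg"
  let m = markup_ Xml bs
  -- Markup -> ByteString -> Markup: expected to hold for well-formed Markup.
  print (markup_ Xml (markdown_ Compact Xml m) == m)
  -- ByteString -> Markup -> ByteString: generally does not hold, since
  -- whitespace within tags, comments and declarations is forgotten.
  print (markdown_ Compact Xml (markup_ Xml bs) == bs)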
At a lower level, a round trip pipeline might look something like:
tokenize Html >=>
  gather Html >=>
  (normalize >>> pure) >=>
  degather Html >=>
  (fmap (detokenize Html) >>> pure)

- tokenize converts a ByteString to a Token list.
- gather gathers the token list into a Markup tree, placing children in nodes and removing EndTags.
- normalize concatenates content and normalizes attributes.
- degather turns the markup tree back into a token list.
- detokenize turns each token back into a bytestring.
Along the way, the Kleisli fishies (>=>) and forward composition (>>>) accumulate any warnings via the These monad instance.
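Assembled into a single definition, that pipeline might look like the following (a sketch; roundTripTokens is an illustrative name, not part of the module):

import Control.Category ((>>>))
import Control.Monad ((>=>))
import Data.ByteString (ByteString)
import MarkupParse

-- Bytes -> tokens -> tree -> normalized tree -> tokens -> bytes,
-- accumulating warnings in the Warn (These [MarkupWarning]) monad.
roundTripTokens :: Standard -> ByteString -> Warn [ByteString]
roundTripTokens std =
  tokenize std
    >=> gather std
    >=> (normalize >>> pure)
    >=> degather std
    >=> (fmap (detokenize std) >>> pure)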
Markup
newtype Markup Source #

A list of Elements (that is, Tree Tokens).
>>>
markup Html "<foo class=\"bar\">baz</foo>"
That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
From a parsing point of view, Html and Xml (and Svg) are close enough that they share most of the parsing logic, so parsing and printing need only minor tweaks per standard.

The Xml parsing logic is based on the XML productions found in https://www.w3.org/TR/xml/.

The Html parsing is based on a reading of html-parse, but ignores the various '\x00' to '\xfffd' and EOF directives that form part of the HTML standard.
markup :: Standard -> ByteString -> Warn Markup Source #
Convert bytestrings to Markup
>>>
markup Html "<foo><br></foo><baz"
These [MarkupParser (ParserLeftover "<baz")] (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]})
data RenderStyle Source #
Rendering style for converting Markup back to bytestrings: Compact adds no extra whitespace, while Indented n indents nested tags by n spaces. Indented 0 just puts newlines in between the tags.
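A rough comparison of the render styles (the outputs in the comments are inferred from the descriptions and doctests in this module, not verified):

{-# LANGUAGE OverloadedStrings #-}

import Data.ByteString (ByteString)
import MarkupParse

renderStyles :: [ByteString]
renderStyles =
  let m = markup_ Html "<foo><br></foo>"
   in [ markdown_ Compact Html m      -- expected: "<foo><br></foo>"
      , markdown_ (Indented 0) Html m -- expected: "<foo>\n<br>\n</foo>"
      , markdown_ (Indented 4) Html m -- expected: "<foo>\n    <br>\n</foo>"
      ]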
Instances
Generic RenderStyle Source # | |
Defined in MarkupParse Associated Types type Rep RenderStyle :: Type -> Type # | |
Show RenderStyle Source # | |
Defined in MarkupParse Methods showsPrec :: Int -> RenderStyle -> ShowS # show :: RenderStyle -> String # showList :: [RenderStyle] -> ShowS # | |
Eq RenderStyle Source # | |
Defined in MarkupParse | |
type Rep RenderStyle Source # | |
Defined in MarkupParse type Rep RenderStyle = D1 ('MetaData "RenderStyle" "MarkupParse" "markup-parse-0.1.0.0-EUIevmPWNIb1WdbRKKaIpt" 'False) (C1 ('MetaCons "Compact" 'PrefixI 'False) (U1 :: Type -> Type) :+: C1 ('MetaCons "Indented" 'PrefixI 'False) (S1 ('MetaSel ('Nothing :: Maybe Symbol) 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 Int))) |
markdown :: RenderStyle -> Standard -> Markup -> Warn ByteString Source #
Convert Markup
to bytestrings
>>>
markdown (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])
That "<foo>\n <br>\n</foo>"
markdown_ :: RenderStyle -> Standard -> Markup -> ByteString Source #
Convert Markup
to ByteString
and error on warnings.
>>>
B.putStr $ markdown_ (Indented 4) Html (markup_ Html [i|<foo><br></foo>|])
<foo>
    <br>
</foo>
normalize :: Markup -> Markup Source #
Concatenate sequential content, and normalize attributes: class values are merged (unwords-style) into one space-separated value, and other duplicate attributes are removed, taking the last.
>>>
B.putStr $ warnError $ markdown Compact Xml $ normalize (markup_ Xml [i|<foo class="a" class="b" bar="first" bar="last"/>|])
<foo bar="last" class="a b"/>
normContent :: Markup -> Markup Source #
Normalise Content in Markup, concatenating adjacent Content, and removing mempty Content.
>>>
normContent $ content "a" <> content "" <> content "b"
Markup {elements = [Node {rootLabel = Content "ab", subForest = []}]}
wellFormed :: Standard -> Markup -> [MarkupWarning] Source #
Check for well-formedness and return warnings encountered.
>>>
wellFormed Html $ Markup [Node (Comment "") [], Node (EndTag "foo") [], Node (OpenTag EmptyElemTag "foo" []) [Node (Content "bar") []], Node (OpenTag EmptyElemTag "foo" []) []]
[EmptyContent,EndTagInTree,LeafWithChildren,BadEmptyElemTag]
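isWellFormed (see the synopsis) presumably reduces to checking that this warning list is empty; a sketch under that assumption:

import MarkupParse

-- Assumption: well-formedness is simply the absence of warnings.
checkWellFormed :: Standard -> Markup -> Bool
checkWellFormed std m = null (wellFormed std m)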
Warnings
data MarkupWarning Source #
markup-parse generally tries to continue on parse errors, returning whatever could still be parsed together with any warnings.
Constructors
BadEmptyElemTag | A tag ending with "/>" that is not an element of selfClosers. |
SelfCloserWithChildren | A tag ending with "/>" that has children. Cannot happen in the parsing phase. |
LeafWithChildren | Only a StartTag can have child tokens. |
TagMismatch NameTag NameTag | An EndTag with a different name to the currently open StartTag. |
UnmatchedEndTag | An EndTag with no corresponding StartTag. |
UnclosedTag | A StartTag with no corresponding EndTag. |
EndTagInTree | An EndTag should never appear in a gathered Markup tree. |
EmptyContent | Empty Content, Comment, Decl or Doctype. |
BadDecl | Badly formed declaration. |
MarkupParser ParserWarning | A warning from the underlying parser. |
Instances
type Warn a = These [MarkupWarning] a Source #
A type synonym for the common return type of many functions. A common computation pipeline takes advantage of the These Monad instance, e.g.

markup s bs = bs & (tokenize s >=> gather s)
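Another small example of the same idea: chaining a parse and a render with >=> so that warnings from both stages are collected (parseRender is an illustrative name, not part of the module):

import Control.Monad ((>=>))
import Data.ByteString (ByteString)
import MarkupParse

-- Parse then re-render, accumulating warnings from both stages in Warn.
parseRender :: Standard -> ByteString -> Warn ByteString
parseRender std = markup std >=> markdown Compact std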
warnError :: Warn a -> a Source #
Convert any warnings to an error
>>>
warnError $ (tokenize Html) "<foo"
*** Exception: MarkupParser (ParserLeftover "<foo") ...
warnEither :: Warn a -> Either [MarkupWarning] a Source #
Returns Left on any warnings
>>>
warnEither $ (tokenize Html) "<foo><baz"
Left [MarkupParser (ParserLeftover "<baz")]
warnMaybe :: Warn a -> Maybe a Source #
Returns results, if any, ignoring warnings.
>>>
warnMaybe $ (tokenize Html) "<foo><baz"
Just [OpenTag StartTag "foo" []]
element creation
element :: NameTag -> [Attr] -> Markup -> Markup Source #
Create a Markup element from a NameTag and attributes that wraps some other Markup.
>>>
element "div" [] (element_ "br" [])
Markup {elements = [Node {rootLabel = OpenTag StartTag "div" [], subForest = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}]}
element_ :: NameTag -> [Attr] -> Markup Source #
Create a Markup element from a NameTag and attributes that doesn't wrap any other Markup. The OpenTagType is StartTag; use emptyElem if you want to create an EmptyElemTag.
>>>
(element_ "br" [])
Markup {elements = [Node {rootLabel = OpenTag StartTag "br" [], subForest = []}]}
emptyElem :: NameTag -> [Attr] -> Markup Source #
Create a Markup element from a NameTag and attributes using EmptyElemTag, that doesn't wrap some other Markup. No checks are made on whether this creates well-formed Markup.
>>>
emptyElem "br" []
Markup {elements = [Node {rootLabel = OpenTag EmptyElemTag "br" [], subForest = []}]}
elementc :: NameTag -> [Attr] -> ByteString -> Markup Source #
Create a Markup element from a NameTag and attributes that wraps some Content
.
>>>
elementc "div" [] "content"
Markup {elements = [Node {rootLabel = OpenTag StartTag "div" [], subForest = [Node {rootLabel = Content "content", subForest = []}]}]}
content :: ByteString -> Markup Source #
Create a Markup element from a bytestring, escaping the usual characters.
>>>
content "<content>"
Markup {elements = [Node {rootLabel = Content "&lt;content&gt;", subForest = []}]}
contentRaw :: ByteString -> Markup Source #
Create a Markup element from a bytestring, not escaping the usual characters.
>>>
contentRaw "<content>"
Markup {elements = [Node {rootLabel = Content "<content>", subForest = []}]}
>>>
markup_ Html $ markdown_ Compact Html $ contentRaw "<content>"
*** Exception: UnclosedTag ...
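The creation helpers compose via the Markup Semigroup instance (as used in the normContent example above). A sketch of building and rendering a small page (page and renderPage are illustrative names):

{-# LANGUAGE OverloadedStrings #-}

import Data.ByteString (ByteString)
import MarkupParse

-- Assemble a small HTML page from the element helpers.
page :: Markup
page =
  doctypeHtml
    <> element "html" []
         ( element "head" [] (elementc "title" [] "Hello")
             <> element "body" [] (elementc "p" [Attr "class" "greeting"] "Hello, world!")
         )

-- Render it; markdown_ throws an exception if any warnings are encountered.
renderPage :: ByteString
renderPage = markdown_ (Indented 2) Html page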
Token components
type NameTag = ByteString Source #
Name of a tag.
selfClosers :: [NameTag] Source #
Html tags that self-close
addAttrs :: [Attr] -> Token -> Maybe Token Source #
Append attributes to the existing Token attribute list.
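An illustrative use (the result in the comment is an assumption; presumably Nothing is returned for tokens that cannot carry attributes, such as Content):

{-# LANGUAGE OverloadedStrings #-}

import MarkupParse

-- Append an id attribute to a start tag (sketch).
tagged :: Maybe Token
tagged = addAttrs [Attr "id" "x"] (OpenTag StartTag "foo" [])
-- expected: Just (OpenTag StartTag "foo" [Attr {attrName = "id", attrValue = "x"}])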
doctypeHtml :: Markup Source #
Standard Html Doctype
>>>
markdown_ Compact Html doctypeHtml
"<!DOCTYPE html>"
doctypeXml :: Markup Source #
Standard Xml Doctype
>>>
markdown_ Compact Xml doctypeXml
"<?xml version=\"1.0\" encoding=\"utf-8\"?><!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">"
type AttrName = ByteString Source #
Name of an attribute.
type AttrValue = ByteString Source #
Value of an attribute. "" is equivalent to true with respect to boolean attributes.
data Attr Source #

An attribute of a tag.

In parsing, boolean attributes, which are not required to have a value in HTML, are given a value of "", which is fine, but the empty value will then be rendered explicitly:
>>>
detokenize Html <$> tokenize_ Html [i|<input checked>|]
["<input checked=\"\">"]
Instances
Generic Attr Source # | |
Show Attr Source # | |
NFData Attr Source # | |
Defined in MarkupParse | |
Eq Attr Source # | |
Ord Attr Source # | |
ToExpr Attr Source # | |
Defined in MarkupParse | |
type Rep Attr Source # | |
Defined in MarkupParse type Rep Attr = D1 ('MetaData "Attr" "MarkupParse" "markup-parse-0.1.0.0-EUIevmPWNIb1WdbRKKaIpt" 'False) (C1 ('MetaCons "Attr" 'PrefixI 'True) (S1 ('MetaSel ('Just "attrName") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrName) :*: S1 ('MetaSel ('Just "attrValue") 'NoSourceUnpackedness 'SourceStrict 'DecidedStrict) (Rec0 AttrValue))) |
nameP :: Standard -> Parser e ByteString Source #
Parse a tag name. Each standard is slightly different.
Tokens
data OpenTagType Source #
Whether an opening tag is a start tag or an empty element tag.
Constructors
StartTag | |
EmptyElemTag |
Instances
Generic OpenTagType Source # | |
Defined in MarkupParse Associated Types type Rep OpenTagType :: Type -> Type # | |
Show OpenTagType Source # | |
Defined in MarkupParse Methods showsPrec :: Int -> OpenTagType -> ShowS # show :: OpenTagType -> String # showList :: [OpenTagType] -> ShowS # | |
NFData OpenTagType Source # | |
Defined in MarkupParse Methods rnf :: OpenTagType -> () # | |
Eq OpenTagType Source # | |
Defined in MarkupParse | |
Ord OpenTagType Source # | |
Defined in MarkupParse Methods compare :: OpenTagType -> OpenTagType -> Ordering # (<) :: OpenTagType -> OpenTagType -> Bool # (<=) :: OpenTagType -> OpenTagType -> Bool # (>) :: OpenTagType -> OpenTagType -> Bool # (>=) :: OpenTagType -> OpenTagType -> Bool # max :: OpenTagType -> OpenTagType -> OpenTagType # min :: OpenTagType -> OpenTagType -> OpenTagType # | |
ToExpr OpenTagType Source # | |
Defined in MarkupParse | |
type Rep OpenTagType Source # | |
data Token Source #

A Markup token
>>>
runParser_ (many (tokenP Html)) [i|<foo>content</foo>|]
[OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
>>>
runParser_ (tokenP Xml) [i|<foo/>|]
OpenTag EmptyElemTag "foo" []
>>>
runParser_ (tokenP Html) "<!-- Comment -->"
Comment " Comment "
>>>
runParser_ (tokenP Xml) [i|<?xml version="1.0" encoding="UTF-8"?>|]
Decl "xml" [Attr {attrName = "version", attrValue = " version=\"1.0\""},Attr {attrName = "encoding", attrValue = "UTF-8"}]
>>>
runParser_ (tokenP Html) "<!DOCTYPE html>"
Doctype "DOCTYPE html"
>>>
runParser_ (tokenP Xml) "<!DOCTYPE foo [ declarations ]>"
Doctype "DOCTYPE foo [ declarations ]"
>>>
runParser (tokenP Html) [i|<foo a="a" b="b" c=c check>|]
OK (OpenTag StartTag "foo" [Attr {attrName = "a", attrValue = "a"},Attr {attrName = "b", attrValue = "b"},Attr {attrName = "c", attrValue = "c"},Attr {attrName = "check", attrValue = ""}]) ""
>>>
runParser (tokenP Xml) [i|<foo a="a" b="b" c=c check>|]
Fail
Constructors
OpenTag !OpenTagType !NameTag ![Attr] | A tag. https://developer.mozilla.org/en-US/docs/Glossary/Tag |
EndTag !NameTag | A closing tag. |
Content !ByteString | The content between tags. |
Comment !ByteString | Contents of a comment. |
Decl !ByteString ![Attr] | Contents of a declaration |
Doctype !ByteString | Contents of a doctype declaration. |
Instances
tokenize :: Standard -> ByteString -> Warn [Token] Source #
Parse a bytestring into tokens
>>>
tokenize Html [i|<foo>content</foo>|]
That [OpenTag StartTag "foo" [],Content "content",EndTag "foo"]
tokenP :: Standard -> Parser e Token Source #
A flatparse Token
parser.
>>>
runParser (tokenP Html) "<foo>content</foo>"
OK (OpenTag StartTag "foo" []) "content</foo>"
detokenize :: Standard -> Token -> ByteString Source #
Bytestring representation of a Token.
>>>
detokenize Html (OpenTag StartTag "foo" [])
"<foo>"
gather :: Standard -> [Token] -> Warn Markup Source #
Gather together token trees from a token list, placing child elements in nodes and removing EndTags.
>>>
gather Html =<< tokenize Html "<foo class=\"bar\">baz</foo>"
That (Markup {elements = [Node {rootLabel = OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}], subForest = [Node {rootLabel = Content "baz", subForest = []}]}]})
degather :: Standard -> Markup -> Warn [Token] Source #
Convert a markup into a token list, adding end tags.
>>>
degather Html =<< markup Html "<foo class=\"bar\">baz</foo>"
That [OpenTag StartTag "foo" [Attr {attrName = "class", attrValue = "bar"}],Content "baz",EndTag "foo"]
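A sketch of the token-level round trip these two functions suggest; the expectation that it is the identity for well-formed input is an assumption, not a documented law:

import Control.Monad ((>=>))
import MarkupParse

-- Tokens -> tree -> tokens, accumulating any warnings.
tokenRoundTrip :: Standard -> [Token] -> Warn [Token]
tokenRoundTrip std = gather std >=> degather std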
XML specific Parsers
xmlVersionInfoP :: Parser e ByteString Source #
xml production [24]
xmlEncodingDeclP :: Parser e ByteString Source #
xml production [80]
xmlStandaloneP :: Parser e ByteString Source #
Xml production [32]
xmlVersionNumP :: Parser e ByteString Source #
xml production [26]
xmlEncNameP :: Parser e ByteString Source #
xml production [81]
xmlYesNoP :: Parser e ByteString Source #
Xml yes/no
bytestring support
utf8ToStr :: ByteString -> String #
Convert a UTF-8 encoded ByteString to a String.
strToUtf8 :: String -> ByteString #
Convert a String to a UTF-8 encoded ByteString.
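A quick illustrative round trip between the two (both are re-exported in the synopsis above):

import MarkupParse (strToUtf8, utf8ToStr)

-- String -> ByteString -> String on a non-ASCII example.
utf8RoundTrip :: Bool
utf8RoundTrip = utf8ToStr (strToUtf8 "café ← ☃") == "café ← ☃"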
escapeChar :: Char -> ByteString Source #
Escape a single character.
escape :: ByteString -> ByteString Source #
Escape Content
>>>
escape [i|<foo class="a" bar='b'>|]
"<foo class="a" bar=&aposb&apos>"
Tree support
data Tree a #

Non-empty, possibly infinite, multi-way trees; also known as rose trees.
Instances
MonadFix Tree | Since: containers-0.5.11 |
MonadZip Tree | |
Foldable Tree | |
Defined in Data.Tree Methods fold :: Monoid m => Tree m -> m # foldMap :: Monoid m => (a -> m) -> Tree a -> m # foldMap' :: Monoid m => (a -> m) -> Tree a -> m # foldr :: (a -> b -> b) -> b -> Tree a -> b # foldr' :: (a -> b -> b) -> b -> Tree a -> b # foldl :: (b -> a -> b) -> b -> Tree a -> b # foldl' :: (b -> a -> b) -> b -> Tree a -> b # foldr1 :: (a -> a -> a) -> Tree a -> a # foldl1 :: (a -> a -> a) -> Tree a -> a # elem :: Eq a => a -> Tree a -> Bool # maximum :: Ord a => Tree a -> a # | |
Eq1 Tree | Since: containers-0.5.9 |
Ord1 Tree | Since: containers-0.5.9 |
Read1 Tree | Since: containers-0.5.9 |
Defined in Data.Tree | |
Show1 Tree | Since: containers-0.5.9 |
Traversable Tree | |
Applicative Tree | |
Functor Tree | |
Monad Tree | |
Hashable1 Tree | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
Generic1 Tree | |
Data a => Data (Tree a) | |
Defined in Data.Tree Methods gfoldl :: (forall d b. Data d => c (d -> b) -> d -> c b) -> (forall g. g -> c g) -> Tree a -> c (Tree a) # gunfold :: (forall b r. Data b => c (b -> r) -> c r) -> (forall r. r -> c r) -> Constr -> c (Tree a) # toConstr :: Tree a -> Constr # dataTypeOf :: Tree a -> DataType # dataCast1 :: Typeable t => (forall d. Data d => c (t d)) -> Maybe (c (Tree a)) # dataCast2 :: Typeable t => (forall d e. (Data d, Data e) => c (t d e)) -> Maybe (c (Tree a)) # gmapT :: (forall b. Data b => b -> b) -> Tree a -> Tree a # gmapQl :: (r -> r' -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQr :: forall r r'. (r' -> r -> r) -> r -> (forall d. Data d => d -> r') -> Tree a -> r # gmapQ :: (forall d. Data d => d -> u) -> Tree a -> [u] # gmapQi :: Int -> (forall d. Data d => d -> u) -> Tree a -> u # gmapM :: Monad m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMp :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # gmapMo :: MonadPlus m => (forall d. Data d => d -> m d) -> Tree a -> m (Tree a) # | |
Generic (Tree a) | |
Read a => Read (Tree a) | |
Show a => Show (Tree a) | |
NFData a => NFData (Tree a) | |
Eq a => Eq (Tree a) | |
Ord a => Ord (Tree a) | Since: containers-0.6.5 |
Hashable v => Hashable (Tree v) | Since: hashable-1.3.4.0 |
Defined in Data.Hashable.Class | |
ToExpr a => ToExpr (Tree a) | |
Defined in Data.TreeDiff.Class | |
type Rep1 Tree | Since: containers-0.5.8 |
Defined in Data.Tree type Rep1 Tree = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) Par1 :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) ([] :.: Rec1 Tree))) | |
type Rep (Tree a) | Since: containers-0.5.8 |
Defined in Data.Tree type Rep (Tree a) = D1 ('MetaData "Tree" "Data.Tree" "containers-0.6.5.1" 'False) (C1 ('MetaCons "Node" 'PrefixI 'True) (S1 ('MetaSel ('Just "rootLabel") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 a) :*: S1 ('MetaSel ('Just "subForest") 'NoSourceUnpackedness 'NoSourceStrictness 'DecidedLazy) (Rec0 [Tree a]))) |