1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
|
module Lexer
open Types
open Shared
open LexerShared
type LexerState =
| Normal
| InCodeBlock of content: string * Language
| InHTMLTag of tag: string * depth: int
let htmlSingleton = [
"area"; "base"; "br"; "col"; "command"; "embed"; "hr"; "img"; "input"
"keygen"; "link"; "meta"; "param"; "source"; "track"; "wbr"
]
/// An escaped char tokenizer, which identifies the escaped characters and returns them
/// as a literal instead, without the leading '\'
let (|EscapedCharTok|_|) = (|EscapedChar|_|) LITERAL charList
/// Returns the Token type of the identifier token
let (|CharacterTok|_|) = (|Character|_|) charList
let (|MatchLang|_|) = function
| RegexMatch "(p|P)ython" _ -> Some Python | RegexMatch "F#|fsharp|f#" _ -> Some FSharp
| RegexMatch "(C|c)(\\+\\+|pp)" _ -> Some CPP | RegexMatch "C|c" _ -> Some C | _ -> Some Empty
let (|InList|_|) lst el =
match List.exists ((=) el) lst with
| true -> Some el
| _ -> None
let (|HTMLStartTag|_|) = (|RegexMatch|_|) "^<([a-zA-Z]+)\\s*.*?>"
let (|HTMLEndTag|_|) = (|RegexMatch|_|) "^<\\/([a-zA-Z]+)\\s*.*?>"
let (|HTMLSingleton|_|) = function
| RegexMatch "^<([a-zA-Z]+)\\s*.*?(?:\\/>|>)" (s, [InList htmlSingleton g], r) ->
Some (s, r)
| _ -> None
let (|CodeBlockStart|_|) = (|GroupMatch|_|) "^```+\\s*([a-zA-Z0-9+\\-_]*)"
/// Returns the next Token of a string
let nextToken state s =
match s, state with
| EscapedCharTok n, _ -> n, state
| HTMLSingleton (s, r), Normal ->
(LITERAL s, r), Normal
| HTMLStartTag (s, [t], r), Normal ->
(LITERAL s, r), InHTMLTag (t, 1)
| HTMLStartTag (s, [t], r), InHTMLTag (tag, d) ->
if t = tag then
(LITERAL s, r), InHTMLTag (tag, d+1)
else
(LITERAL s, r), InHTMLTag (tag, d)
| HTMLEndTag (s, [t], r), InHTMLTag (tag, d) ->
if t = tag then
if d = 1 then (LITERAL s, r), Normal
else (LITERAL s, r), InHTMLTag (tag, d-1)
else (LITERAL s, r), InHTMLTag (tag, d)
| RegexMatch "^.+?(?=<)" (s, _, r), InHTMLTag (t, d) ->
(LITERAL s, r), InHTMLTag (t, d)
| RegexMatch "^.*" (s, _, r), InHTMLTag (t, d) ->
(LITERAL s, r), InHTMLTag (t, d)
| CharacterTok n, _ -> n, state
| RegexMatch @"^\s+" (m, _, s), _ ->
(replaceChars "\t" " " m
|> String.length |> WHITESPACE, s), state
| RegexMatch "^[0-9]+" (m, _, s), _->
(NUMBER m, s), state
| RegexMatch (literalString charList) (m, _, s), _ ->
(LITERAL m, s), state
| s, _ ->
sprintf "Unrecognised character: %A" s |> sharedLog.Warn None
(toString s.[0] |> LITERAL, (sOnwards 1 s)), state
/// Lexes a whole string and returns the result as a Token list
let lexS state source =
let rec lexS' state s tokList =
match s, state with
| "", InHTMLTag _ -> tokList
| "", _ -> ENDLINE :: tokList
| _ ->
let (nt, st'), nstate = nextToken state s
nt :: tokList |> lexS' nstate st'
match source, state with
| CodeBlockStart (MatchLang lang), Normal ->
[], InCodeBlock ("", lang)
| RegexMatch "^```+" _, InCodeBlock (s, lang) ->
[CODEBLOCK (s, lang); ENDLINE], Normal
| _, InCodeBlock (s, lang) ->
[], InCodeBlock (s+source+"\n", lang)
| RegexMatch @"^\s*$" _, _ ->
[ENDLINE], state
| _ ->
lexS' state source [] |> List.rev, state
/// Return the correct token if it is not close properly at the end
let returnTokens = function
| _, InCodeBlock (s, l) ->
[CODEBLOCK (s, l); ENDLINE]
| tok, InHTMLTag (str, _) ->
tok @ [LITERAL str; ENDLINE]
| tok, _ ->
tok
/// Lex a single string
let lex s =
lexS Normal s |> returnTokens
/// Lexes a list of strings and returns the Token list
let lexList strl =
let f (flist, state) nstr =
let (lst, st) = lexS state nstr
flist @ lst, st
List.fold f ([], Normal) strl |> returnTokens
|