aboutsummaryrefslogtreecommitdiffstats
path: root/FMark/src/Common/Lexer/Lexer.fs
blob: a7bd1116091e067f3ad8c14cedcbc8f0363ac9ce (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
module Lexer

open Types
open Shared
open LexerShared

type LexerState =
    | Normal
    | InCodeBlock of content: string * Language
    | InHTMLTag of tag: string * depth: int

let htmlSingleton = [
    "area"; "base"; "br"; "col"; "command"; "embed"; "hr"; "img"; "input"
    "keygen"; "link"; "meta"; "param"; "source"; "track"; "wbr"
]

/// An escaped char tokenizer, which identifies the escaped characters and returns them
/// as a literal instead, without the leading '\'
let (|EscapedCharTok|_|) = (|EscapedChar|_|) LITERAL charList

/// Returns the Token type of the identifier token
let (|CharacterTok|_|) = (|Character|_|) charList

let (|MatchLang|_|) = function
    | RegexMatch "(p|P)ython" _ -> Some Python | RegexMatch "F#|fsharp|f#" _ -> Some FSharp
    | RegexMatch "(C|c)(\\+\\+|pp)" _ -> Some CPP | RegexMatch "C|c" _ -> Some C | _ -> Some Empty

let (|InList|_|) lst el =
    match List.exists ((=) el) lst with
    | true -> Some el
    | _ -> None

let (|HTMLStartTag|_|) = (|RegexMatch|_|) "^<([a-zA-Z]+)\\s*.*?>"



let (|HTMLEndTag|_|) = (|RegexMatch|_|) "^<\\/([a-zA-Z]+)\\s*.*?>"

let (|HTMLSingleton|_|) = function
    | RegexMatch "^<([a-zA-Z]+)\\s*.*?(?:\\/>|>)" (s, [InList htmlSingleton g], r) ->
        Some (s, r)
    | _ -> None

let (|CodeBlockStart|_|) = (|GroupMatch|_|) "^```+\\s*([a-zA-Z0-9+\\-_]*)"

/// Returns the next Token of a string
let nextToken state s =
    match s, state with
    | EscapedCharTok n, _ -> n, state
    | HTMLSingleton (s, r), Normal ->
        (LITERAL s, r), Normal
    | HTMLStartTag (s, [t], r), Normal ->
        (LITERAL s, r), InHTMLTag (t, 1)
    | HTMLStartTag (s, [t], r), InHTMLTag (tag, d) ->
        if t = tag then
            (LITERAL s, r), InHTMLTag (tag, d+1)
        else
            (LITERAL s, r), InHTMLTag (tag, d)
    | HTMLEndTag (s, [t], r), InHTMLTag (tag, d) ->
        if t = tag then
            if d = 1 then (LITERAL s, r), Normal
            else (LITERAL s, r), InHTMLTag (tag, d-1)
        else (LITERAL s, r), InHTMLTag (tag, d)
    | RegexMatch "^.+?(?=<)" (s, _, r), InHTMLTag (t, d) ->
        (LITERAL s, r), InHTMLTag (t, d)
    | RegexMatch "^.*" (s, _, r), InHTMLTag (t, d) ->
        (LITERAL s, r), InHTMLTag (t, d)
    | CharacterTok n, _ -> n, state
    | RegexMatch @"^\s+" (m, _, s), _ ->
        (replaceChars "\t" "  " m 
        |> String.length |> WHITESPACE, s), state
    | RegexMatch "^[0-9]+" (m, _, s), _->
        (NUMBER m, s), state
    | RegexMatch (literalString charList) (m, _, s), _ ->
        (LITERAL m, s), state
    | s, _ ->
        sprintf "Unrecognised character: %A" s |> sharedLog.Warn None
        (toString s.[0] |> LITERAL, (sOnwards 1 s)), state

/// Lexes a whole string and returns the result as a Token list
let lexS state source =
    let rec lexS' state s tokList =
        match s, state with
        | "", InHTMLTag _ -> tokList
        | "", _ -> ENDLINE :: tokList
        | _ ->
            let (nt, st'), nstate = nextToken state s
            nt :: tokList |> lexS' nstate st'
    match source, state with
    | CodeBlockStart (MatchLang lang), Normal ->
        [], InCodeBlock ("", lang)
    | RegexMatch "^```+" _, InCodeBlock (s, lang) ->
        [CODEBLOCK (s, lang); ENDLINE], Normal
    | _, InCodeBlock (s, lang) ->
            [], InCodeBlock (s+source+"\n", lang)
    | RegexMatch @"^\s*$" _, _ ->
        [ENDLINE], state
    | _ ->
        lexS' state source [] |> List.rev, state

/// Return the correct token if it is not close properly at the end
let returnTokens = function
    | _, InCodeBlock (s, l) ->
        [CODEBLOCK (s, l); ENDLINE]
    | tok, InHTMLTag (str, _) ->
        tok @ [LITERAL str; ENDLINE]
    | tok, _ ->
        tok

/// Lex a single string
let lex s =
    lexS Normal s |> returnTokens

/// Lexes a list of strings and returns the Token list
let lexList strl =
    let f (flist, state) nstr =
        let (lst, st) = lexS state nstr
        flist @ lst, st
    List.fold f ([], Normal) strl |> returnTokens