-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexical_analysis.py
116 lines (98 loc) · 3.72 KB
/
lexical_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import re
import json
from config_file import configFile
# Token categories this lexer distinguishes (see LexicalAnalysis.tokenisation):
"""
Keywords
Identifiers
Literals
Operators
Seperators
Comments
Whitespaces
"""
# Load the language description (keywords, operators, separators, comment
# markers) from the Java config JSON at import time.
# NOTE(review): path is relative to the working directory, not this file —
# importing from elsewhere will raise FileNotFoundError; confirm intended.
with open("config/java.json", "r") as f:
    data = json.loads(f.read())
# Lists pulled straight from the JSON config.
KEYWORDS = data["keywords"]
# Regex for a legal identifier: letter or underscore, then alphanumerics.
INDENTIFIER = r"[a-zA-Z_][a-zA-Z0-9_]*"
OPERATORS = data["operators"]
SEPERATORS = data["seperators"]
# Matches a single double or single quote character (regex alternation).
STRING_CHAR = "\"|'"
COMMENTS = data["comments"]
class LexicalAnalysis:
    """Tokeniser: turns raw source text into (token_type, value) pairs."""

    def __init__(self, code: str, config: configFile) -> None:
        self.code = code        # raw source text to analyse
        self.config = config    # token-name configuration object
        self.tokens = []        # result buffer for callers

    def preprocess(self, between_lines: tuple, code: str = None) -> str:
        """Return *code* (default: ``self.code``) with comments stripped,
        restricted to the 1-based, inclusive line range *between_lines*.

        Missing tuple entries default to the first / last line.  Lines that
        become empty once their comment is removed are dropped.
        """
        if code is None:
            code = self.code
        lines = removeBlockComments(code).splitlines()
        start = between_lines[0] if len(between_lines) > 0 else 1
        end = between_lines[1] if len(between_lines) > 1 else len(lines)
        selected = lines[start - 1 : end]
        cleaned = [removeComments(line, COMMENTS["single"]) for line in selected]
        return "\n".join(filter(None, cleaned))

    def tokenisation(self, cleaned_code: str, config: configFile) -> list:
        """Split *cleaned_code* into a list of ``(token_type, value)`` tuples.

        Words are produced by splitting on the configured separators and
        runs of whitespace.  Quote characters toggle an "inside a string
        literal" state; consecutive words inside a literal are joined into
        a single string token.
        """
        tokens = []
        is_string = False       # currently inside a quoted literal?
        create_string = []      # accumulated pieces of the current literal
        split_pattern = createSeperatorRegex()
        for word in re.split(split_pattern, cleaned_code):
            if not word:
                continue
            quote_count = len(re.findall(STRING_CHAR, word))
            if is_string:
                # Inside a literal: accumulate everything verbatim exactly
                # once.  (Fixes a bug where such words were appended twice,
                # and separators inside strings leaked into the token list.)
                create_string.append(word)
                if quote_count % 2:  # odd quote count closes the literal
                    tokens.append((config.literal.string, "".join(create_string)))
                    create_string = []
                    is_string = False
                continue
            if re.match(r"\s+", word):
                tokens.append(("whitespace", word))
                continue
            if quote_count % 2:
                # An odd number of quotes opens a string literal.
                is_string = True
                create_string.append(word)
                continue
            if quote_count > 0:
                # Self-contained literal in a single word, e.g. "abc".
                # (Previously such words were silently dropped.)
                tokens.append((config.literal.string, word))
                continue
            if word in KEYWORDS:
                tokens.append((config.keyword, word))
            elif word in SEPERATORS:
                tokens.append((config.seperator, word))
            elif word in OPERATORS:
                tokens.append((config.operator, word))
            elif re.fullmatch(INDENTIFIER, word):
                tokens.append((config.identifier, word))
            elif word.isdigit():
                tokens.append((config.literal.number, word))
            else:
                # Mixed residue: emit each digit as a number token and
                # every other character as an operator token.
                for char in word:
                    if char.isdigit():
                        tokens.append((config.literal.number, char))
                    else:
                        tokens.append((config.operator, char))
        return tokens
# Need to update to work for different block comments
def removeBlockComments(code: str) -> str:
    """Strip every C-style ``/* ... */`` block comment from *code*.

    The match is non-greedy and spans newlines.  Uses a raw string for
    the pattern: the old ``"\\/\\*..."`` form relied on the invalid ``\\/``
    escape, which raises SyntaxWarning on modern Python.
    """
    return re.sub(r"/\*([\s\S]*?)\*/", "", code)
def removeComments(code: str, comment_type: str) -> str:
    """Remove a trailing single-line comment (marker *comment_type*,
    e.g. ``//``) from *code* and strip trailing whitespace.

    String literals are blanked out first (via ``removeString``) so a
    comment marker that appears inside a string is not treated as the
    start of a comment.
    """
    no_string = removeString(code)
    # rpartition returns (head, sep, tail); sep is "" when not found.
    comment_split = no_string.strip().rpartition(comment_type)
    try:
        comment_position = comment_split.index(comment_type)
    except ValueError:  # narrowed from a bare except: only "not found" expected
        # No marker: slice past the end of the tuple so nothing is removed.
        comment_position = len(comment_split)
    comment = "".join(comment_split[comment_position:])
    # "".replace("", "") is the identity, so a comment-free line survives.
    return code.replace(comment, "").rstrip()
def removeString(code: str) -> str:
    """Blank out every quoted (``"..."`` or ``'...'``) span in *code*."""
    quoted_span = r"""(\"|')([\s\S]*?)(\"|')"""
    return re.sub(quoted_span, "", code)
def createSeperatorRegex(seperators: list = None) -> str:
    """Build an alternation pattern matching any separator or a run of
    whitespace, wrapped in a capture group so ``re.split`` keeps the
    separators in its output.

    :param seperators: separator strings to match; defaults to the
        module-level ``SEPERATORS`` loaded from the config.

    Uses ``re.escape`` instead of blindly prefixing a backslash: the old
    ``"\\\\" + sep`` form breaks for alphanumeric-leading separators
    (e.g. ``\\a`` is the BEL escape, ``\\d`` a digit class).
    """
    if seperators is None:
        seperators = SEPERATORS
    alternatives = [re.escape(sep) for sep in seperators]
    alternatives.append(r"\s+")
    return f"({'|'.join(alternatives)})"