词法分析

词法分析阅读构成源程序的字符流,按编程语言的词法规则把它们组成词法记号流。对于一个词法单元,词法分析产生的记号是:
<记号名, 属性值>
二元组。记号名是同类词法单元共用的名称,而属性值是一个词法单元有别于同类中其他词法单元的特征值。

(1)输入:一个C语言编写的程序(源代码)。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
#include<stdio.h>
int main() {
int n;
scanf("%d", &n);
int* f;
f = (int*)malloc((n+1)*sizeof(int));
f[1] = f[2] = 1;
int i;
for( i=3; i<=n; i++ ) {
f[i] = f[i-1] + f[i-2];
}
printf("%d", f[n]);
return 0;
}

(2)处理:对输入进行分析,分离出关键字、算符、界符、标识符和常量,判断是否符合构词规则,分类。
思路:
对输入的源代码每行进行词法分析;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
string file;
vector<string> v;
pf("请输入文件路径:\n");
cin >> file;
ifstream ifs;
ifs.open(file);
if( ifs ) {
while( !ifs.eof() ) {
char s[512];
ifs.getline(s,512,'\n');
int x = 0;
string s0 = "";
while( s[x]!='\0' ) s0 += s[x++];
if( !s0.empty() ) v.push_back(s0);
}
idnum = 0;
pf("-----------------------\n");
vid.clear();
int len = v.size();
for( int i=0; i<len; i++ ) {
Lexcial_analyzer(v[i]);
}
}else {
pf("文件打开失败!请重新输入\n");
}
ifs.close();

第一次分割按照空格和制表符;

1
2
3
4
5
6
7
8
9
10
for( int i=0; i<len; i++ ) {
if( s[i]!=' '&&s[i]!='\t' ) {
s0 += s[i];
}else {
if( !s0.empty() ) v1.push_back(s0);
s0 = "";
}

}
if( !s0.empty() ) v1.push_back(s0);

第二次按照界符和运算符分割;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
int vlen = v1.size();
for( int i=0; i<vlen; i++ ) {
// cout << v[i] << "--asd--" << v[i].size() << endl;
string sword = v1[i];
int lsw = sword.size();
string sp = "";
for( int j=0; j<lsw; j++ ) {
char* p = find(Delimiter, Delimiter+strlen(Delimiter), sword[j]);
if( p!=Delimiter+strlen(Delimiter) ) {
if( !sp.empty() ) {
string spnew1 = "";
string spnew2 = "";
int len_sp = sp.size();
for( int k=0; k<len_sp; k++ ) {
if( ('0'<=sp[k]&&sp[k]<='9')||('a'<=sp[k]&&sp[k]<='z')||('A'<=sp[k]&&sp[k]<='Z')||sp[k]=='_' ) {
if( !spnew1.empty() ) v2.push_back(spnew1);
spnew1 = "";
spnew2 += sp[k];
}else {
if( !spnew2.empty() ) v2.push_back(spnew2);
spnew2 = "";
spnew1 += sp[k];
}
}
if( !spnew1.empty() ) v2.push_back(spnew1);
if( !spnew2.empty() ) v2.push_back(spnew2);
}
sp = "";
string sschar = "";
sschar += sword[j];
v2.push_back(sschar);
}else {
sp += sword[j];
}
}
if( !sp.empty() ) {
string spnew1 = "";
string spnew2 = "";
int len_sp = sp.size();
for( int k=0; k<len_sp; k++ ) {
if( ('0'<=sp[k]&&sp[k]<='9')||('a'<=sp[k]&&sp[k]<='z')||('A'<=sp[k]&&sp[k]<='Z')||sp[k]=='_' ) {
if( !spnew1.empty() ) v2.push_back(spnew1);
spnew1 = "";
spnew2 += sp[k];
}else {
if( !spnew2.empty() ) v2.push_back(spnew2);
spnew2 = "";
spnew1 += sp[k];
}
}
if( !spnew1.empty() ) v2.push_back(spnew1);
if( !spnew2.empty() ) v2.push_back(spnew2);
}
}

然后对头文件的文件名和左右尖括号特判,

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
bool isj = false;
bool isdeli = false;
string filename = "";
vlen = v2.size();
for( int i=0; i<vlen; i++ ) {
if( v2[i]==">" ) isj = false;
if( isj ) {
filename += v2[i];
}else {
// string* p = find(Keyword, Keyword+33, v2[i]);
if( !filename.empty() ) {
pf("%16s: <filename, %s>\n", filename.c_str(), filename.c_str());
filename = "";
}
// cout << v2[i] << endl;
if( v2[i]=="<"&&isd ) {
pf("%16c: <Delimiter, 13>\n", '<');
}else if( isd&&v2[i]==">" ) {
pf("%16c: <Delimiter, 14>\n", '>');
isd = false;
}else {
D data = F(v2[i]);
if( data.pos<0 ) {
pf("%16s: ERROR!!!\n", data.name.c_str());
}else {
pf("%16s: <%s, %d>\n", data.name.c_str(), data.type.c_str(), data.pos);
}
}

}
if( isd&&v2[i]=="<" ) isj = true;
}

再将分割好的每一个词分成关键字、算符、界符、标识符、常量和不符合构词规则六类;

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
D F(string s) {
int l = s.size();
D data;
data.name = s;

if( l==1 ) {
if( '0'<=s[0]&&s[0]<='9' ) {
data.type = "Number";
data.pos = (int)(s[0]-'0');
}else if( ('a'<=s[0]&&s[0]<='z')||('A'<=s[0]&&s[0]<='Z') ) {
data.type = "ID";
vector<string>::iterator iter = find(vid.begin(), vid.end(), s);
if( iter!=vid.end() ) {
data.pos = iter-vid.begin();
}else {
vid.push_back(s);
data.pos = idnum;
idnum++;
}

}else {
data.type = "Delimiter";
int i;
for( i=0; i<13; i++ ) {
if( Delimiter[i]==s[0] ) {
data.pos = i;
break;
}
}
if( i>=13 ){
data.type = "Operator";
int j;
for( int j=0; j<38; j++ ) {
if( Operator[i]==s ) {
data.pos = i;
break;
}
}
if( j>=38 ) data.pos = -1;
}

}
}else {
if( ('a'<=s[0]&&s[0]<='z')||('A'<=s[0]&&s[0]<='Z')||s[0]=='_' ) {
int i;
for( i=0; i<33; i++ ) {
if( Keyword[i]==s) {
data.pos = i;
break;
}
}
if( i>=33 ) {
int j;
for( j=1; j<s.size(); j++ ) {
if( ( ('0'<=s[j]&&s[j]<='9')||('a'<=s[j]&&s[j]<='z')||('A'<=s[j]&&s[j]<='Z')||s[j]=='_' ) ) {
break;
}
}
if( j>=s.size() ) {
data.pos = -1;
} else {
data.type = "ID";
vector<string>::iterator iter = find(vid.begin(), vid.end(), s);
if( iter!=vid.end() ) {
data.pos = iter-vid.begin();
}else {
vid.push_back(s);
data.pos = idnum;
idnum++;
}
}

}else {
data.type = "Keyword";
}
}else {
data.type = "Operator";
int i;
for( i=0; i<38; i++ ) {
if( Operator[i]==s) {
data.pos = i;
break;
}
}
if( i>=38 ) data.pos = -1;
}
}

return data;
}

(3)输出:对应的二元式。
二元式构成为<type, value>,其中type包含Keyword、Delimiter、Operator、ID、Number,前三个的value是它们所在数组的下标,ID的value是出现的次序,Number的value就是它本身的值。