词法分析阅读构成源程序的字符流,按编程语言的词法规则把它们组成词法记号流。对于一个词法单元,词法分析产生的记号是:
<记号名, 属性值>
二元组。记号名是同类词法单元共用的名称,而属性值是一个词法单元有别于同类中其他词法单元的特征值。
(1)输入:一个C语言编写的程序(源代码)。
1 2 3 4 5 6 7 8 9 10 11 12 13 14
| #include<stdio.h> int main() { int n; scanf("%d", &n); int* f; f = (int*)malloc((n+1)*sizeof(int)); f[1] = f[2] = 1; int i; for( i=3; i<=n; i++ ) { f[i] = f[i-1] + f[i-2]; } printf("%d", f[n]); return 0; }
|
(2)处理:对输入进行分析,分离出关键字、算符、界符、标识符和常量,判断是否符合构词规则,分类。
思路:
对输入的源代码每行进行词法分析;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
| string file; vector<string> v; pf("请输入文件路径:\n"); cin >> file; ifstream ifs; ifs.open(file); if( ifs ) { while( !ifs.eof() ) { char s[512]; ifs.getline(s,512,'\n'); int x = 0; string s0 = ""; while( s[x]!='\0' ) s0 += s[x++]; if( !s0.empty() ) v.push_back(s0); } idnum = 0; pf("-----------------------\n"); vid.clear(); int len = v.size(); for( int i=0; i<len; i++ ) { Lexcial_analyzer(v[i]); } }else { pf("文件打开失败!请重新输入\n"); } ifs.close();
|
第一次分割按照空格和制表符;
1 2 3 4 5 6 7 8 9 10
| for( int i=0; i<len; i++ ) { if( s[i]!=' '&&s[i]!='\t' ) { s0 += s[i]; }else { if( !s0.empty() ) v1.push_back(s0); s0 = ""; }
} if( !s0.empty() ) v1.push_back(s0);
|
第二次按照界符和运算符分割;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54
| int vlen = v1.size(); for( int i=0; i<vlen; i++ ) { // cout << v[i] << "--asd--" << v[i].size() << endl; string sword = v1[i]; int lsw = sword.size(); string sp = ""; for( int j=0; j<lsw; j++ ) { char* p = find(Delimiter, Delimiter+strlen(Delimiter), sword[j]); if( p!=Delimiter+strlen(Delimiter) ) { if( !sp.empty() ) { string spnew1 = ""; string spnew2 = ""; int len_sp = sp.size(); for( int k=0; k<len_sp; k++ ) { if( ('0'<=sp[k]&&sp[k]<='9')||('a'<=sp[k]&&sp[k]<='z')||('A'<=sp[k]&&sp[k]<='Z')||sp[k]=='_' ) { if( !spnew1.empty() ) v2.push_back(spnew1); spnew1 = ""; spnew2 += sp[k]; }else { if( !spnew2.empty() ) v2.push_back(spnew2); spnew2 = ""; spnew1 += sp[k]; } } if( !spnew1.empty() ) v2.push_back(spnew1); if( !spnew2.empty() ) v2.push_back(spnew2); } sp = ""; string sschar = ""; sschar += sword[j]; v2.push_back(sschar); }else { sp += sword[j]; } } if( !sp.empty() ) { string spnew1 = ""; string spnew2 = ""; int len_sp = sp.size(); for( int k=0; k<len_sp; k++ ) { if( ('0'<=sp[k]&&sp[k]<='9')||('a'<=sp[k]&&sp[k]<='z')||('A'<=sp[k]&&sp[k]<='Z')||sp[k]=='_' ) { if( !spnew1.empty() ) v2.push_back(spnew1); spnew1 = ""; spnew2 += sp[k]; }else { if( !spnew2.empty() ) v2.push_back(spnew2); spnew2 = ""; spnew1 += sp[k]; } } if( !spnew1.empty() ) v2.push_back(spnew1); if( !spnew2.empty() ) v2.push_back(spnew2); } }
|
然后对头文件的文件名和左右尖括号特判,
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32
| bool isj = false; bool isdeli = false; string filename = ""; vlen = v2.size(); for( int i=0; i<vlen; i++ ) { if( v2[i]==">" ) isj = false; if( isj ) { filename += v2[i]; }else { // string* p = find(Keyword, Keyword+33, v2[i]); if( !filename.empty() ) { pf("%16s: <filename, %s>\n", filename.c_str(), filename.c_str()); filename = ""; } // cout << v2[i] << endl; if( v2[i]=="<"&&isd ) { pf("%16c: <Delimiter, 13>\n", '<'); }else if( isd&&v2[i]==">" ) { pf("%16c: <Delimiter, 14>\n", '>'); isd = false; }else { D data = F(v2[i]); if( data.pos<0 ) { pf("%16s: ERROR!!!\n", data.name.c_str()); }else { pf("%16s: <%s, %d>\n", data.name.c_str(), data.type.c_str(), data.pos); } } } if( isd&&v2[i]=="<" ) isj = true; }
|
再将分割好的每一个词分成关键字、算符、界符、标识符、常量和不符合构词规则六类;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
| D F(string s) { int l = s.size(); D data; data.name = s; if( l==1 ) { if( '0'<=s[0]&&s[0]<='9' ) { data.type = "Number"; data.pos = (int)(s[0]-'0'); }else if( ('a'<=s[0]&&s[0]<='z')||('A'<=s[0]&&s[0]<='Z') ) { data.type = "ID"; vector<string>::iterator iter = find(vid.begin(), vid.end(), s); if( iter!=vid.end() ) { data.pos = iter-vid.begin(); }else { vid.push_back(s); data.pos = idnum; idnum++; } }else { data.type = "Delimiter"; int i; for( i=0; i<13; i++ ) { if( Delimiter[i]==s[0] ) { data.pos = i; break; } } if( i>=13 ){ data.type = "Operator"; int j; for( int j=0; j<38; j++ ) { if( Operator[i]==s ) { data.pos = i; break; } } if( j>=38 ) data.pos = -1; } } }else { if( ('a'<=s[0]&&s[0]<='z')||('A'<=s[0]&&s[0]<='Z')||s[0]=='_' ) { int i; for( i=0; i<33; i++ ) { if( Keyword[i]==s) { data.pos = i; break; } } if( i>=33 ) { int j; for( j=1; j<s.size(); j++ ) { if( ( ('0'<=s[j]&&s[j]<='9')||('a'<=s[j]&&s[j]<='z')||('A'<=s[j]&&s[j]<='Z')||s[j]=='_' ) ) { break; } } if( j>=s.size() ) { data.pos = -1; } else { data.type = "ID"; vector<string>::iterator iter = find(vid.begin(), vid.end(), s); if( iter!=vid.end() ) { data.pos = iter-vid.begin(); }else { vid.push_back(s); data.pos = idnum; idnum++; } } }else { data.type = "Keyword"; } }else { data.type = "Operator"; int i; for( i=0; i<38; i++ ) { if( Operator[i]==s) { data.pos = i; break; } } if( i>=38 ) data.pos = -1; } } return data; }
|
(3)输出:对应的二元式。
二元式构成为<type, value>,其中type包含Keyword、Delimiter、Operator、ID、Number,前三个的value是它们所在数组的下标,ID的value是出现的次序,Number的value就是它本身的值。