自己动手写编译器:创建由 C 语言编译而成的语法解析器

望月从良

发布于 2023-11-06 17:12:26

8160

在上一章节，我们完成了由 c 语言设计的输入系统，本节我们看看如何在前一节的基础上完成一个由 c 语言设计并编译出来的词法解析器。整个解析器的基本设计思路是： 1，由我们上一节设计的输入系统将字符串从文件中读入。 2，由我们前面 GoLex 程序设计生成的状态机代码负责读入步骤 1 读入的字符串进行识别。 3，由 c 语言设计的模板代码驱动步骤1 和 2 的执行我们看看具体的操作情况。首先我们需要将上一节设计的输入系统对应的函数放入头文件，在 CLex 项目中增加一个头文件l.h，其代码内容如下：

#ifndef __L_H
#define __L_H
extern  int ii_newfile(char* name);
extern  unsigned char* ii_text();
extern  int ii_flush(int force);
extern  int ii_length();
extern  int ii_lineno();
extern  unsigned char* ii_ptext();
extern  int ii_plength();
extern  int ii_plineno();
extern  unsigned  char* ii_mark_start();
extern  unsigned  char* ii_mark_end();
extern  unsigned  char* ii_move_start();
extern  unsigned  char* ii_to_mark();
extern  unsigned  char* ii_mark_prev();
extern  int ii_advance();
extern  int ii_flush(int force);
extern  int ii_fillbuf(unsigned  char* starting_at);
extern  int ii_look(int n);
extern  int ii_pushback(int n);
extern  void ii_term();
extern  void ii_unterm();
extern  int ii_input();
extern  void ii_unput(int c);
extern  int ii_lookahead(int n);
extern  int ii_flushbuf();
#endif

接着在 GoLex 中生成状态机的 c 语言代码，在 main.go 中代码如下（这些代码我们在前面章节讲解和调试过）:

func main() {
    lexReader, _ := nfa.NewLexReader("input.lex", "output.py")
    lexReader.Head()
    parser, _ := nfa.NewRegParser(lexReader)
    start := parser.Parse()
    parser.PrintNFA(start)

    nfaConverter := nfa.NewNfaDfaConverter()
    nfaConverter.MakeDTran(start)
    nfaConverter.PrintDfaTransition()

    nfaConverter.MinimizeDFA()
    fmt.Println("---------new DFA transition table ----")
    nfaConverter.PrintMinimizeDFATran()

    //nfaConverter.DoPairCompression()
    nfaConverter.DoSquash()
    nfaConverter.PrintDriver()
}

上面代码运行后，我们会在本地生成 lex.yy.c 的文件，我们将文件中的所有代码拷贝到 CLex 的 main.c 文件中。接下来我们需要一段驱动输入系统读入数据，然后调用生成的状态机代码进行字符串识别的”胶水“代码，其名为 yylex,依然在 main.c 中，输入 yylex 函数对应的代码如下：

int yylex() {
    static int yystate = -1;
    int yylastaccept;
    int yyprev;
    int yynstate;
    int yylook;  //预读取字符
    int yyanchor;

    if(yystate == -1) {
        //将数据读入缓冲区
        ii_advance();
        //ii_advance 使得 Next 指针移动了一位，因此在我们还没有读入任何字符时，需要将其后退回去
        ii_pushback(1);
    }

    yystate = 0;
    yyprev = 0;
    yylastaccept = 0;
    ii_unterm();
    ii_mark_start();
    while(1) {
        /*
        * 这里我们采取贪婪算法，如果当前识别的字符串已经进入识别状态，
        * 但是还有字符可以读取，那么我们先缓存当前识别状态，然后继续识别后续字符，
        * 直到文件抵达末尾或者输入的字符导致识别失败，此时我们再返回到上次识别状态
        * 进行处理，这种方法让我们尽可能获得能进入完成状态的最长字符串
        */
        while(1) {
            yylook = ii_look(1);
            if (yylook != EOF) {
                yynstate = yy_next(yystate, yylook);
                break;
            } else {
                if (yylastaccept) {
                    /*
                     * 如果文件数据读取完毕，并且我们抵达过完成状态，那么设置下一个状态为
                     * 非法状态
                     */
                    yynstate = YYF;
                    break;
                }
                else if(yywrap()) {
                    yytext = "";
                    yyleng = 0;
                    return 0;
                }
                else {
                    ii_advance();
                    ii_pushback(1);
                }
            }
        }// inner while

        if (yynstate != YYF) {
            //跳转到下一个有效状态
            printf("Transation from state %d ", yystate);
            printf(" to state %d on <%c>\n", yynstate, yylook);

            if (ii_advance() < 0) {
                //缓冲区已满
                printf("Line %d, lexeme too long. Discarding extra characters.\n", ii_lineno());
                ii_flush(1);
            }

            yyanchor = Yyaccept[yynstate];
            if (yyanchor) {
                yyprev = yystate;
                yylastaccept = yynstate;
                ii_mark_end(); //完成一个字符串的识别
            }

            yystate = yynstate;
        } else {
            //跳转到无效状态，说明输入字符串不合法
            if (!yylastaccept) {
                //忽略掉非法字符
                printf("Ignoring bad input\n");
                ii_advance();
            } else {
                //回退到上一次接受状态
                ii_to_mark();
                if (yyanchor & 2) {
                    //末尾匹配,将末尾回车符号放回缓冲区
                    ii_pushback(1);
                }
                if (yyanchor & 1) {
                    //开头匹配，忽略掉字符串开头的回车符号
                    ii_move_start();
                }
                ii_term();
                //获取当前识别的字符串，极其长度和所在行号
                yytext = (char*) ii_text();
                yyleng = ii_length();
                yylineno = ii_lineno();

                printf("Accepting state%d, ", yylastaccept);
                printf("line %d: <%s>\n", yylineno, yytext);

                switch (yylastaccept) {
                    /*
                     * 这里根据接受状态执行其对应的代码，实际上这里的代码
                     * 后面会由 GoLex 生成
                     */
                    case 3:
                    case 4:
                        printf("%s is a float number", yytext);
                        return FCON;
                    default:
                        printf("internal error, yylex: unkonw accept state %d.\n", yylastaccept);
                        break;
                }
            }

            ii_unterm();
            yylastaccept = 0;
            yystate = yyprev;

        }

    }// outer while
}

最后我们在 main 函数中调用 yylex 函数，驱动识别进程，main 函数内容如下：

int main() {
    int fd = ii_newfile("/Users/my/Documents/CLex/num.txt");
    if (fd == -1) {
        printf("value of errno: %d\n", errno);
    }
    yylex();
    return 0;
}

完成上面代码后，我们就对 c 语言代码进行编译，生成可执行文件，注意在上面代码中，我们使用输入系统的 ii_newfile 函数读入了一个名为 num.txt 的文件，这个文件的内容包含要识别的字符串，实际上这个文件地址可以作为程序参数输入，这里为了简单，我们直接写入代码中，在本地创建文件 num.txt，在里面输入一个数字字符串 3.14 然后保存，最后我们执行 c 语言代码编译的程序，输出结果如下：

Transation from state 0  to state 4 on <3>
Transation from state 4  to state 3 on <.>
Transation from state 3  to state 3 on <1>
Transation from state 3  to state 3 on <4>
Accepting state3, line 1: <
3.14>

这里我们可以看到，创建的 c 语言代码能正确的识别给定文件里的字符串为浮点数，同时他打印出了状态机在识别每个字符时的状态跳转，由此基本断定，我们 c 语言代码的设计基本正确，下一节我们的目的是将当前”手动“的阶段全部用程序来替代，例如将 GoLex 生成的代码进行粘贴等操作我们都用代码来完成，当这些代码生成和代码粘贴的动作都由 GoLex 完成后，那么它就变成了在编译原理工具链里有名的 Flex 应用，更多详细内容，请大家在 b 站搜索

本文参与腾讯云自媒体同步曝光计划，分享自微信公众号。

原始发表：2023-11-05，如有侵权请联系 cloudcommunity@tencent.com 删除

编译