Strange numbers from the lexer

Time: 2016-08-06 14:52:07

Tags: c lexer

I'm cleaning up my lexer, but I'm having trouble printing and assigning numbers correctly. Below is my output, which should come from lexer_num; I think the problem is somewhere around lexer_float. (I've included the extra information that was requested.) I don't have a hex dump and don't really know what one is, so I don't think I can provide it. To answer the question about lexer_num, this is the output:

Type: "40" {
    Line: "1"
    Pos: "0"
    Num: "2591542"
    Real: "0.000000"
    Stri: ""
}

The lexer implementation:

#define _CRT_SECURE_NO_WARNINGS
#define DEBUG 0

#include "lexer.h"
#include "error.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <ctype.h>
#include <assert.h>

typedef struct lextoken_t {
    const char* str;
    token_t type;
} lextoken_t;

static const lextoken_t keywords[] = {
    // types
    { "int", _int },
    { "double", _dbl },
    { "void", _void },
    { "char", _char },
    { "string", _str },
    { "bool", _bool },
    { "const", _const },
    { "struct", _struct }
};

/* token_new: creates and returns a new token ptr.
** -lexer: a ptr to the lexer.
** -type: the token type.
*/
token_t* token_new(lexer_t* lexer, tk_type type) {
    token_t* token = malloc(sizeof(token_t));
    token->line = lexer->line;
    token->pos = lexer->pos;
    token->type = type;
    token->integer = 0;
    token->flt = 0;
    token->string = NULL;
    return token;
}

static void token_print(token_t* token) {
    if (token == NULL)
        printf("Null token");
    printf("Type: \"%i\" { \n", token->type);
    printf("\tLine: \"%i\"\n", token->line);
    printf("\tPos: \"%i\"\n", token->pos);
    printf("\tNum: \"%i\"\n", token->integer);
    printf("\tReal: \"%f\"\n", token->flt);
    printf("\tStri: \"%s\"\n}\n\n", token->string);
}

/* lexer_look: look at the source (ahead) places infront of the lexer->ptr.
** -lexer: a ptr to the lexer to look ahead in.
** -ahead: how far ahead of the ptr to look.
*/
static char lexer_look(lexer_t* lexer, size_t ahead) {
    if (lexer->len < lexer->ptr + ahead) {
        error_new(lexer->errors, 0, 0,
            "The lexer tried to index %d out of bounds %d", lexer->ptr + ahead, lexer->len);
        return;
    }
    return lexer->src[lexer->ptr + ahead];
}

static size_t can_adv(lexer_t* lexer, size_t steps) {
    if (lexer->ptr + steps <= lexer->len)
        return 1;
    else
        return 0;
}

/* lexer_adv: moves the lexer->ptr (steps) places.
** -lexer: a ptr to the lexer to look ahead in.
** -steps: how far to advance the ptr.
*/
static char lexer_adv(lexer_t* lexer, size_t steps) {
    if (!can_adv(lexer, steps))
        error_new(lexer->errors, 0, 0,
            "The lexer tried to move ptr past bounds %d with value of %d", lexer->len, lexer->ptr + steps);
    lexer->ptr += steps;
    return lexer->src[lexer->ptr];
}

static void new_line(lexer_t* lexer) {
    lexer->line = 0;
    lexer->pos = 0;
}

static void lexer_nested(lexer_t* lexer) {
    lexer_adv(lexer, 2);
    char c = lexer_look(lexer, 0);
    size_t depth = 1;
    while (depth > 0) {
        if (!can_adv(lexer, 1))
            error_new(lexer->errors, lexer->line, lexer->pos, "Unterminated block comment.");
        else if (c == '*' && lexer_look(lexer, 1) == '#') {
            lexer_adv(lexer, 2);
            depth--;
        }
        else if (c == '#' && lexer_look(lexer, 1) == '*') {
            lexer_adv(lexer, 2);
            depth++;
        }
        else
            c = lexer_adv(lexer, 1);
    }
}

static void lexer_comment(lexer_t* lexer) {
    if (lexer_look(lexer, 1) == '*')
        lexer_nested(lexer);
    else {
        char c;
        while (((c = lexer_look(lexer, 0)) != '\n') && can_adv(lexer, 1))
            lexer_adv(lexer, 1);
        new_line(lexer);
    }
}

static token_t* lexer_str(lexer_t* lexer) {
    size_t str_len = 0;
    while (true) {
        if (!can_adv(lexer, 1)) {
            error_new(lexer->errors, lexer->len, lexer->pos, "Unterminated string.");
            return NULL;
        }
        else if (lexer_look(lexer, 1) == '\"') {
            lexer_adv(lexer, 2);
            break;
        }
        else {
            lexer_adv(lexer, 1);
            str_len++;
        }
    }
    char* string = malloc(str_len + 1);
    for (size_t idx = 0; idx < str_len; idx++)
        string[idx] = lexer->src[lexer->ptr - str_len + idx];
    string[str_len] = '\0';
    token_t* token = token_new(lexer, _str);
    token->string = string;
    return token;
}

static token_t* lexer_float(lexer_t* lexer, token_t* token, size_t v) {
    size_t places = 0;
    double d = v;
    if (!isdigit(lexer_look(lexer, 1))) {
        return token;
    }
    while (lexer->len > 0) {
        char c = lexer_look(lexer, 1);
        if (isdigit(c)) {
            lexer_adv(lexer, 1);
            d = (d * 10) + (c - '0');
            places++;
        }
        else
            break;
    }
    token->flt = d / (places * 10);
    token->string = "";
    return token;
}

static token_t* lexer_num(lexer_t* lexer) {
    token_t* token = token_new(lexer, _int);
    size_t v = 0;
    while (can_adv(lexer, 1)) {
        char c = lexer_look(lexer, 0);
        if (isdigit(c)) {
            v = (v * 10) + (c - '0');
            lexer_adv(lexer, 1);
        }
        else if (c == '.') {
            lexer_adv(lexer, 1);
            return lexer_float(lexer, token, v);
        }
        else {
            break;
        }
    }
    token->integer = v;
    token->string = "";
    return token;
}

static token_t* lexer_ident(lexer_t* lexer) {
    token_t* token = token_new(lexer, _ident);
    size_t id_len = 0;
    while (can_adv(lexer, 1)) {
        if (!isalpha(lexer_look(lexer, 0)))
            break;
        lexer_adv(lexer, 1);
        id_len++;
    }
    char* ident = malloc(id_len + 1);
    for (size_t idx = 0; idx < id_len; idx++)
        ident[idx] = lexer->src[lexer->ptr - id_len + idx];
    ident[id_len] = '\0';
    token->string = ident;
    return token;
}

static token_t* next_token(lexer_t* lexer) {
    token_t* token = NULL;
    while (token == NULL && can_adv(lexer, 1)) {
        const int c = lexer_look(lexer, 0);
        if (DEBUG)
            printf("Current character: \"%c\", Length: %d, Pointer: %d \n",
                lexer_look(lexer, 0), lexer->len, lexer->ptr);
        switch (c) {
        case '=':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _eqto);
                lexer_adv(lexer, 2);
                token->string = "==";
            }
            else {
                token = token_new(lexer, _assign);
                token->string = "=";
                lexer_adv(lexer, 1);
            }
            break;
        case '+':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _addeql);
                lexer_adv(lexer, 2);
                token->string = "+=";
            }
            else {
                token = token_new(lexer, _add);
                token->string = "+";
                lexer_adv(lexer, 1);
            }
            break;
        case '-':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _subeql);
                lexer_adv(lexer, 2);
                token->string = "-=";
            }
            else {
                token = token_new(lexer, _sub);
                token->string = "-";
                lexer_adv(lexer, 1);
            }
            break;
        case '*':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _muleql);
                lexer_adv(lexer, 2);
                token->string = "*=";
            }
            else {
                token = token_new(lexer, _mul);
                token->string = "*";
                lexer_adv(lexer, 1);
            }
            break;
        case '/':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _diveql);
                lexer_adv(lexer, 2);
                token->string = "/=";
            }
            else {
                token = token_new(lexer, _div);
                token->string = "/";
                lexer_adv(lexer, 1);
            }
            break;
        case '<':
            if (lexer_look(lexer, 1) == '<') {
                token = token_new(lexer, _nteq);
                lexer_adv(lexer, 2);
                token->string = "<=";
            }
            else {
                token = token_new(lexer, _bang);
                token->string = "<";
                lexer_adv(lexer, 1);
            }
            break;
        case '>':
            if (lexer_look(lexer, 1) == '<') {
                token = token_new(lexer, _nteq);
                lexer_adv(lexer, 2);
                token->string = ">=";
            }
            else {
                token = token_new(lexer, _bang);
                token->string = ">";
                lexer_adv(lexer, 1);
            }
            break;
        case '&':
            if (lexer_look(lexer, 1) == '&') {
                token = token_new(lexer, _and);
                lexer_adv(lexer, 2);
                token->string = "&&";
            }
            else {
                token = token_new(lexer, _notype);
                lexer_adv(lexer, 1);
            }
            break;
        case '|':
            if (lexer_look(lexer, 1) == '|') {
                token = token_new(lexer, _or);
                lexer_adv(lexer, 2);
                token->string = "||";
            }
            else {
                token = token_new(lexer, _notype);
                lexer_adv(lexer, 1);
            }
            break;
        case '%':
            token = token_new(lexer, _mod);
            token->string = "%";
            lexer_adv(lexer, 1);
            break;
        case '^':
            token = token_new(lexer, _mod);
            token->string = "^";
            lexer_adv(lexer, 1);
            break;
        case '!':
            if (lexer_look(lexer, 1) == '=') {
                token = token_new(lexer, _nteq);
                lexer_adv(lexer, 2);
                token->string = "!=";
            }
            else {
                token = token_new(lexer, _bang);
                token->string = "!";
                lexer_adv(lexer, 1);
            }
            break;
        case '\"':
            token = lexer_str(lexer);
            break;
        case '#':
            lexer_comment(lexer);
            break;
        case '(':
            token = token_new(lexer, _lpara);
            token->string = "(";
            lexer_adv(lexer, 1);
            break;
        case ')':
            token = token_new(lexer, _rpara);
            token->string = ")";
            lexer_adv(lexer, 1);
            break;
        case '{':
            token = token_new(lexer, _lcurl);
            token->string = "{";
            lexer_adv(lexer, 1);
            break;
        case '}':
            token = token_new(lexer, _rcurl);
            token->string = "}";
            lexer_adv(lexer, 1);
            break;
        case '[':
            token = token_new(lexer, _lbrac);
            token->string = "[";
            lexer_adv(lexer, 1);
            break;
        case ']':
            token = token_new(lexer, _rbrac);
            token->string = "]";
            lexer_adv(lexer, 1);
            break;
        case ';':
            token = token_new(lexer, _terml);
            token->string = ";";
            lexer_adv(lexer, 1);
            break;
        default:
            if (isalpha(c) || c == '_')
                token = lexer_ident(lexer);
            else if (isdigit(c) || c == '.') {
                token = lexer_num(lexer);
            }
            else if (isspace(c))
                lexer_adv(lexer, 1);
            else
                token = token_new(lexer, _eof);
            break;
        }
    }
    return token;
}

void lexer_print(lexer_t* lexer) {
    size_t line = lexer->line;
    size_t pos = lexer->pos;
    size_t ptr = lexer->ptr;
    token_t* token = next_token(lexer);
    while (token != NULL && token->type != _eof) {
        token_print(token);
        token = next_token(lexer);
    }
    lexer->ptr = ptr;
    lexer->pos = pos;
}

lexer_t* lexer_open(const char* file_name) {
    FILE* file_ptr = fopen(file_name, "rb");
    lexer_t* lexer = malloc(sizeof(lexer_t));
    lexer->errors = errorlist_new();
    lexer->line = 1;
    lexer->pos = 0;
    lexer->ptr = 0;
    if (file_ptr == NULL) {
        error_new(lexer->errors, 0, 0, "Couldent open file \"%s\".\n", file_name);
        fclose(file_ptr);
        free(lexer);
    }
    if (fseek(file_ptr, 0, SEEK_END) != 0) {
        fclose(file_ptr);
        return NULL;
    }
    lexer->len = ftell(file_ptr);
    if (lexer->len == -1) {
        error_new(lexer->errors, 0, 0, "Unable to get the size of file \"%s\".\n", file_name);
        fclose(file_ptr);
        free(lexer);
    }
    fseek(file_ptr, 0, SEEK_SET);
    lexer->src = malloc(lexer->len);
    size_t r = fread(lexer->src, lexer->len, 1, file_ptr);
    fclose(file_ptr);
    return lexer;
}

void lexer_close(lexer_t* lexer) {
    if (lexer->src != NULL)
        free(lexer->src);
    free(lexer);
}

The reason these are size_t is that they never need to be negative; negative numbers are handled in the parser rather than in the lexer.

The header (lexer.h):

#ifndef LEXER_H
#define LEXER_H

#include "error.h"
#include <stdio.h>
#include <stdbool.h>
#include <malloc.h>
#include <assert.h>

typedef enum tk_type {
    // primitives
    _notype,
    _str,
    _gen_num,
    _ident,
    _type,

    // symbols
    _rbrac,
    _lbrac,
    _rpara,
    _lpara,
    _rcurl,
    _lcurl,
    _terml,

    _assign,
    _bang,

    _add,
    _addeql,
    _sub,
    _subeql,
    _div,
    _diveql,
    _mul,
    _muleql,
    _exp,
    _mod,

    // comparison operators
    _lt,
    _lteq,
    _gt,
    _gteq,
    _eqto,
    _nteq,
    _and,
    _or,

    // keywords
    _while,
    _for,
    _if,
    _else,
    _match,
    _case,
    _return,
    _break,
    _int,
    _float,
    _enum,
    _true,
    _false,
    _import,
    _struct,
    _mac,
    _dbl,
    _void,
    _char,
    _bool,
    _const,

    // abstract
    _block,
    _eof
} tk_type;

typedef struct token_t {
    tk_type type;
    size_t line;
    size_t pos;

    union {
        char* string;
        double flt;
        size_t integer;
    };
} token_t;

typedef struct lexer_t {
    size_t line;
    size_t pos;
    size_t ptr;
    size_t len;
    char* src;

    errorlist_t* errors;
} lexer_t;

void lexer_print(lexer_t* lexer);

#endif

Code:

int main() {
    int var = 10 + 2;
}


1 Answer:

Answer 0 (score: 1):

The obvious problem in lexer_num seems to be right at the end:

token->integer = v;
token->string = "";

Because token_t contains an anonymous union that overlays the integer, flt, and string fields, this stores the number that was read and then immediately overwrites it with a pointer to the static string literal "". You want to delete the token->string = ""; line.
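
To make the clobbering concrete, here is a minimal standalone sketch (not taken from the post) with the same anonymous-union layout; the second assignment reuses the storage of the first:

#include <stdio.h>

/* Toy struct mirroring token_t's anonymous union. */
typedef struct {
    union {
        char* string;
        double flt;
        size_t integer;
    };
} demo_t;

int main(void) {
    demo_t t;
    t.integer = 12;               /* store the parsed number */
    t.string = "";                /* same storage: this clobbers the 12 */
    printf("%zu\n", t.integer);   /* prints bits of the "" pointer, not 12 */
    return 0;
}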

Of course, your token_print routine may then crash, because it tries to read the string field even when the token is not a string.

lexer_float has the same problem...
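
One possible way to avoid both problems, sketched against the post's own types (it assumes the token->string = ""; lines are removed from lexer_num and lexer_float, and that lexer_float is also changed to set token->type, e.g. to _dbl, so the type always says which union member is live):

/* Sketch, not tested: print only the union member that matches the token type. */
static void token_print(token_t* token) {
    if (token == NULL) {
        printf("Null token\n");
        return;
    }
    printf("Type: \"%i\" {\n", (int)token->type);
    printf("\tLine: \"%zu\"\n", token->line);
    printf("\tPos: \"%zu\"\n", token->pos);
    switch (token->type) {
    case _int:
        printf("\tNum: \"%zu\"\n", token->integer);
        break;
    case _dbl:
        printf("\tReal: \"%f\"\n", token->flt);
        break;
    default:
        /* all other token kinds store a string (possibly NULL) */
        printf("\tStri: \"%s\"\n", token->string ? token->string : "");
        break;
    }
    printf("}\n\n");
}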