如何处理换行符和c中的分隔符?

时间:2018-08-01 14:13:34

标签: c token lexer

enter image description here

实际的文本文件只是用于测试lex和解析的随机内容。上面的图片是结果,控制台在运行时给了我。绿色通常是一个标识符,应该是换行符或delim,因此不需要。红色表示无法识别分隔符,黄色表示无法识别事物。我假设它与前面的c有关;不被分开吧。

所以我的问题是我如何正确地分隔令牌,并识别换行符,或者我做错了什么。下面是我用来进行分离和标记化的代码。

#define _CRT_SECURE_NO_WARNINGS
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define BUFFER_SIZE    1024

// Returns 'true' if the character is a DELIMITER.
bool isDelimiter(char ch)
{
    if (isspace (ch))
        return (true);
    return (false);
}

// Returns 'true' if the character is a SEPERATOR.
bool isSeperator(char str)
{
    if (str == ',' || str == ';' || str == '>' ||
        str == '<' || str == '(' || str == ')' || str == '[' || str == ']' || 
        str == '{' || str == '}' || str == '.' )
        return (true);
    return (false);
}

// Returns 'true' if the character is an OPERATOR.
bool isOperator(char ch)
{
    if (ch == '+' || ch == '-' || ch == '*' ||
        ch == '/' || ch == '>' || ch == '<' ||
        ch == '=')
        return (true);
    return (false);
}

// Returns 'true' if the string is a VALID IDENTIFIER.
bool validIdentifier(char* str)
{
    if (str[0] == '0' || str[0] == '1' || str[0] == '2' ||
        str[0] == '3' || str[0] == '4' || str[0] == '5' ||
        str[0] == '6' || str[0] == '7' || str[0] == '8' ||
        str[0] == '9' )
        return (false);
    return (true);
}

// Returns 'true' if the string is a KEYWORD.
bool isKeyword(char* str)
{
    if (!strcmp(str, "if") || !strcmp(str, "else") ||
        !strcmp(str, "while") || !strcmp(str, "do") ||
        !strcmp(str, "break") || !strcmp(str, "elem") ||
        !strcmp(str, "lout") || !strcmp(str, "file") ||
        !strcmp(str, "console") || !strcmp(str, "read") ||
        !strcmp(str, "write") || !strcmp(str, "mark") ||
        !strcmp(str, "emblemnize") || !strcmp(str, "lin") ||
        !strcmp(str, "send") || !strcmp(str, "dint") ||
        !strcmp(str, "continue") || !strcmp(str, "int")
        || !strcmp(str, "double") || !strcmp(str, "float")
        || !strcmp(str, "return") || !strcmp(str, "char")
        || !strcmp(str, "case") || !strcmp(str, "char")
        || !strcmp(str, "sizeof") || !strcmp(str, "long")
        || !strcmp(str, "short") || !strcmp(str, "typedef")
        || !strcmp(str, "switch") || !strcmp(str, "unsigned")
        || !strcmp(str, "void") || !strcmp(str, "static")
        || !strcmp(str, "struct") || !strcmp(str, "goto"))
        return (true);
    return (false);
}

// Returns 'true' if the string is an INTEGER.
bool isInteger(char* str)
{
    int i, len = strlen(str);

    if (len == 0)
        return (false);
    for (i = 0; i < len; i++) {
        if (str[i] != '0' && str[i] != '1' && str[i] != '2'
            && str[i] != '3' && str[i] != '4' && str[i] != '5'
            && str[i] != '6' && str[i] != '7' && str[i] != '8'
            && str[i] != '9' || (str[i] == '-' && i > 0))
            return (false);
    }
    return (true);
}

// Returns 'true' if the string is a REAL NUMBER.
bool isRealNumber(char* str)
{
    int i, len = strlen(str);
    bool hasDecimal = false;

    if (len == 0)
        return (false);
    for (i = 0; i < len; i++) {
        if (str[i] != '0' && str[i] != '1' && str[i] != '2'
            && str[i] != '3' && str[i] != '4' && str[i] != '5'
            && str[i] != '6' && str[i] != '7' && str[i] != '8'
            && str[i] != '9' && str[i] != '.' ||
            (str[i] == '-' && i > 0))
            return (false);
        if (str[i] == '.')
            hasDecimal = true;
    }
    return (hasDecimal);
}

// Extracts the SUBSTRING.
char* subString(char* str, int left, int right)
{
    int i;
    char* subStr = (char*)malloc(sizeof(char) * (right - left + 2));

    for (i = left; i <= right; i++)
        subStr[i - left] = str[i];
    subStr[right - left + 1] = '\0';
    return (subStr);
}

// Parsing the input STRING.
void parse(char* str)
{
    int left = 0, right = 0;
    int len = strlen(str);

    while (right <= len && left <= right)
    {
        if (isDelimiter(str[right]) == false)
            right++;


        if (isDelimiter(str[right]) == true && left == right)
        {
            if (isOperator(str[right]) == true)
                printf("'%c' IS A OPERATOR\n", str[right]);

            right++;
            left = right;
        }

        if (isDelimiter(str[right]) == true && left == right)
        {
            if (isDelimiter(str[right]) == true)
                printf("'%c' IS A DELIMITER\n", str[right]);

            right++;
            left = right;
        }

        if (isSeperator(str[right]) == true && left == right)
        {
            //needed to recognize seperator to the right
            if (isSeperator(str[right]) == true)
                printf("'%c' IS A SEPERATOR\n", str[right]);

            right++;
            left = right;

            //needed to recognize seperator to the left
            if (isSeperator(str[right]) == true)
                printf("'%c' IS A SEPERATOR\n", str[left]);

            right++;
            left = right;
        }
        else if (isDelimiter(str[right]) == true && left != right
            || (right == len && left != right)) {
            char* subStr = subString(str, left, right - 1);

            if (isKeyword(subStr) == true)
                printf("'%s' IS A KEYWORD\n", subStr);

            else if (isInteger(subStr) == true)
                printf("'%s' IS AN INTEGER\n", subStr);

            else if (isRealNumber(subStr) == true)
                printf("'%s' IS A REAL NUMBER\n", subStr);

            else if (validIdentifier(subStr) == true
                && isDelimiter(str[right - 1]) == false
                && isSeperator(str[right - 1]) == false)
                printf("'%s' IS A VALID IDENTIFIER\n", subStr);

            left = right;
        }
    }
    return;
}

int main(int argc, char *argv)
{

    /* declare a file pointer */
    FILE    *file;
    char    *buffer;
    long    numbytes;

    /* open an existing file for reading */
    file = fopen("Text.txt", "r");

    /* quit if the file does not exist */
    if (file == NULL)
        return 1;

    /* Get the number of bytes */
    fseek(file, 0L, SEEK_END);
    numbytes = ftell(file);

    /* reset the file position indicator to
    the beginning of the file */
    fseek(file, 0L, SEEK_SET);

    /* grab sufficient memory for the
    buffer to hold the text */
    buffer = (char*)calloc(numbytes, sizeof(char));

    /* memory error */
    if (buffer == NULL)
        return 1;

    /* copy all the text into the buffer */
    fread(buffer, sizeof(char), numbytes, file);

    /* confirm we have read the file by
    outputing it to the console */
    printf("  The file called Text.txt contains this text  \n     \n %s             \n\n", buffer);

    parse(buffer); // calling the parse function
    fclose(file);

    /* free the memory we used for the buffer */
    free(buffer);



    return 0;
}

1 个答案:

答案 0 :(得分:1)

问题似乎出在您的isDelimiter函数未获取所有可能的值。如果将其更改为使用isspace(),它将在所有形式的空格上匹配。

bool isDelimiter(char ch)
{
    if (isspace(ch))
        return (true);
    return (false);
}

作为示例,这是一个非常简单的状态机,可让您了解我的意思。它可以处于INSIDE_IDENTIFIER或OUTSIDE_IDENTIFIER两种状态之一,并且可以根据要查看的字符类型在这两种状态之间进行切换。

#define OUTSIDE_IDENTIFIER (0)
#define INSIDE_IDENTIFIER (1)

void parse(char *str)
    {
    char *ch;
    int state=OUTSIDE_IDENTIFIER;
    char buffer[1000];
    char *pos=buffer;

    for(ch=str;*ch!='\0';ch++)
        {
        switch(state)
            {
        case INSIDE_IDENTIFIER:
            if(isOperator(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Operator[%c]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else if(isDelimiter(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Delimiter[%c]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else if(isspace(*ch))
                {
                *pos='\0';
                printf("Identifier[%s]\n",buffer);
                printf("Space[%d]\n",*ch);
                state=OUTSIDE_IDENTIFIER;
                }
            else
                {
                *pos=*ch;
                pos++;
                }
        break; 
        case OUTSIDE_IDENTIFIER:
        default:
            if(isOperator(*ch))
                {
                printf("Operator[%c]\n",*ch);
                }
            else if(isDelimiter(*ch))
                {
                printf("Delimiter[%c]\n",*ch);
                }
            else if(isSeperator(*ch))
                {
                printf("Seperator[%c]\n",*ch);
                }
            else
                {
                state = INSIDE_IDENTIFIER;
                pos=buffer;
                *pos=*ch;
                pos++;
                }
        break;
            }
        }
    }