用C检查HTML标签

时间:2012-10-24 09:47:51

标签: c parsing

我正在用C编写一个小程序,用来检查HTML文件中的开始标签和结束标签是否正确配对。但我遇到了一些问题......我有一个名为tags.txt的文件,其中包含所有可能的标签(以下只是开头的几个):

<a>
</a>
<abbr>
</abbr>
<area>
</area>
<aside>
</aside>

我有htmlfile.html,我必须检查:

<!--#echo var="date" -->
<area>
</area>
<area>
</area>

其次,我想把上面这样的注释(<!--#echo var="date" -->)替换为系统日期(sysdate)。日期的格式化没有问题,我自己可以做到,但用程序把它写回文件这一步我做不好。

我的代码:

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

#define MAX_SIZE 512


void menu();
void check();
void datumos();


/* Program entry point: hands control to the interactive menu loop. */
int main(int argc,char *argv[])
{
    /* The command-line arguments are not used by this program. */
    (void)argc;
    (void)argv;

    menu();
    return 0;
}

/*
 * menu() - interactive command loop.
 *
 * Prints the available commands, reads one whitespace-delimited word from
 * stdin and dispatches on it: "c" runs check(), "d" runs datumos(); any
 * other word simply redisplays the menu.
 *
 * Takes no arguments and returns nothing.  The loop ends when stdin reaches
 * end-of-file or a read error occurs (or when the user presses CTRL + C).
 */
void menu()
{
    char choice[MAX_SIZE];

    for (;;)
    {
        printf("\npress a button:\n\n");

        printf("\tFile HTML check..............:c\n");
        /* The double quotes inside the literal have to be escaped. */
        printf("\t<!--#echo var=\"date\" -->...........:d\n");
        printf("\tExit:\tCTRL + C\n");

        /*
         * The field width (MAX_SIZE - 1 = 511) keeps a long word from
         * overflowing the buffer.  Anything other than one successful
         * conversion means EOF or a read error: leave instead of spinning.
         */
        if (scanf("%511s", choice) != 1)
        {
            return;
        }

        if (strcmp(choice, "c") == 0)
        {
            check();
        }
        else if (strcmp(choice, "d") == 0)
        {
            datumos();
        }
    }
}

/*
 * check() - intended to verify that the tags found in htmlfile.html are
 * all present in the list of known tags read from tags.txt.
 *
 * Takes no arguments and returns nothing.  Progress markers ("check__N")
 * are printed along the way, and exit(1) is called when the final test
 * decides that the file is not well formed.
 *
 * NOTE(review): as posted, this function has several defects; each one is
 * marked inline below.
 */
void check()
{
    FILE *htmlfile;
    FILE *checkfile;

    /*
     * NOTE(review): mode "w" truncates htmlfile.html and opens it for
     * writing only, so the fgets() loop over htmlfile further down cannot
     * read anything; "r" is needed.  Neither fopen() result is checked for
     * NULL, and neither stream is ever closed.
     */
    htmlfile = fopen("htmlfile.html","w");
    checkfile = fopen("tags.txt","r");

char line[MAX_SIZE];
char htmlline[MAX_SIZE];
/* Room for 189 known tags of at most 29 characters plus the terminator. */
char tags[189][30];


int i=0;

printf("\tcheck__1\n");

/* Load the known tags, one per line of tags.txt. */
while(fgets(line,sizeof(line),checkfile) != NULL)
    {

        int j;
    /*
     * NOTE(review): sizeof(line) is MAX_SIZE (512), but a row of tags is
     * only 30 bytes, so this copy writes far past tags[i].  i is not
     * bounded by 189 either.  fgets() keeps the trailing '\n', so it would
     * be stored as part of the tag.
     */
    for(j=0; j<sizeof(line); ++j)
    {
        tags[i][j]=line[j];
    }
    ++i;

    }
printf("\tcheck__2\n");


/* k counts the tags collected from the HTML file. */
int k=0;    char htmlfiletags[MAX_SIZE][30];
    while(fgets(htmlline,sizeof(htmlline),htmlfile) != NULL)
    {
    /*
     * NOTE(review): currentline is never filled from htmlline, so every
     * read of it below uses uninitialised data.
     */
    char currentline[sizeof(htmlline)];
    int j=0;

        /*
         * NOTE(review): "<" is a string literal (a pointer); comparing a
         * char with it is not a character comparison.  '<' is meant.
         */
        if( currentline[j]=="<" )
        {

                /* NOTE(review): same problem; '>' is meant.  There is also
                 * no stop at the end of the string or at 30 bytes. */
                while(currentline[j]!=">") 
                {
                    htmlfiletags[k][j]=currentline[j];
                    ++j;
                }
                /*
                 * NOTE(review): the first argument of strcat() must be a
                 * char *; htmlfiletags[k][j+1] is a single char.
                 */
                strcat(htmlfiletags[k][j+1],">"); 
                ++k; 
        }
}
printf("\tcheck__3\n");


 int n;
 /*
  * NOTE(review): sizeof(htmlfiletags) is the array's size in bytes
  * (MAX_SIZE * 30 = 15360), not the number of tags collected (k).
  */
 for(n=0; n<sizeof(htmlfiletags); ++n)
 {
     int j; int howmanytimesnot=0;

     /*
      * NOTE(review): sizeof(tags) is 189 * 30 = 5670 bytes, but tags has
      * only 189 rows (and only i of them were filled), so tags[j] is out
      * of bounds from j = 189 onwards.
      */
     for(j=0; j<sizeof(tags); ++j)
     {
         printf("\tcheck__3/1\n");


         /* NOTE(review): despite its name, the counter is incremented on a
          * match (strcmp() == 0), not on a mismatch. */
         if(strcmp(htmlfiletags[n],tags[j])==0)
         {
             printf("\t%d\n", howmanytimesnot);

             ++howmanytimesnot;
         }
     }

    printf("\tcheck__3/3\n");

     /* NOTE(review): true only when the counter reaches sizeof(tags),
      * i.e. 5670. */
     if(!(howmanytimesnot<sizeof(tags)))
        {
            printf("\tcheck__3/4\n");
          printf("the file is not wellformed");

          exit (1);
        }

 }
 printf("\tcheck__4\n");


}

/*
 * copy_file() - rewrites `to` from the lines of `from`.
 *
 * from: path of the file that is read.
 * to:   path of the file that is written; it is opened with "w", so any
 *       previous content is discarded.
 *
 * After both streams are closed, tempfile.html is deleted.
 */
void copy_file(const char *from,const char *to)
{
    FILE *fr;
    FILE *t;
    /* NOTE(review): neither fopen() result is checked for NULL. */
    fr = fopen(from,"r");
    t = fopen(to,"w");

    char line[MAX_SIZE];

    char row[MAX_SIZE];

    while(fgets(line,sizeof(line),fr) != NULL)
    {
            /*
             * NOTE(review): "%s" skips leading whitespace and stops at the
             * next whitespace character, so only the first word of each
             * line reaches the output: "<!--#include something -->" is
             * written back as "<!--#include".  For a blank line sscanf()
             * stores nothing and row keeps whatever it held before.
             * fputs(line,t) would copy the line unchanged.
             */
            sscanf(line,"%s",row);
            fprintf(t,"%s\n",row);
    }

    fclose(fr);
    fclose(t);

    /* NOTE(review): the file name is hard-coded here; the `from` argument
     * is not what gets removed. */
    remove("tempfile.html");
 }


/*
 * datumos() - replaces the SSI date directive in htmlfile.html.
 *
 * Every occurrence of  <!--#echo var="date" -->  is rewritten as an HTML
 * comment holding the current local date and time (strftime "%x_%X"); the
 * replacement is also echoed to stdout.  The result is written to
 * tempfile.html and then handed to copy_file() to be written back to
 * htmlfile.html.
 *
 * Takes no arguments and returns nothing.  If the time cannot be formatted
 * or one of the files cannot be opened, the problem is reported on stderr
 * and the function returns without calling copy_file().
 *
 * A directive that straddles a MAX_SIZE - 1 character boundary of a very
 * long line is not recognised, because the file is processed in
 * fgets()-sized pieces.
 */
void datumos()
{
    /* The quotes around date belong to the directive and must be escaped. */
    static const char datecomment[] = "<!--#echo var=\"date\" -->";
    const size_t datecomment_len = sizeof(datecomment) - 1;

    time_t now = time(NULL);
    struct tm *t = localtime(&now);
    char date_time[30];

    /* strftime() returns 0 when the text does not fit into the buffer. */
    if (t == NULL || strftime(date_time, sizeof(date_time), "%x_%X", t) == 0)
    {
        fprintf(stderr, "datumos: cannot format the current date\n");
        return;
    }

    FILE *htmlfile = fopen("htmlfile.html", "r");
    if (htmlfile == NULL)
    {
        perror("htmlfile.html");
        return;
    }

    FILE *tempfile = fopen("tempfile.html", "w");
    if (tempfile == NULL)
    {
        perror("tempfile.html");
        fclose(htmlfile);
        return;
    }

    char line[MAX_SIZE];

    while (fgets(line, sizeof(line), htmlfile) != NULL)
    {
        /*
         * Search inside the line instead of comparing the whole line:
         * fgets() keeps the trailing '\n', so a plain strcmp() against the
         * directive can never succeed for a newline-terminated line.
         */
        const char *rest = line;
        const char *hit;

        while ((hit = strstr(rest, datecomment)) != NULL)
        {
            /* Text in front of the directive is copied unchanged. */
            fwrite(rest, 1, (size_t)(hit - rest), tempfile);
            fprintf(tempfile, "<!--%s-->", date_time);
            printf("<!--%s-->", date_time);
            rest = hit + datecomment_len;
        }

        /* Remainder of the line, including its original line ending. */
        fputs(rest, tempfile);
    }

    fclose(htmlfile);
    fclose(tempfile);

    copy_file("tempfile.html","htmlfile.html");
}

程序在这里崩溃:在内层for循环中,大约执行到第200次if检查时......我不知道为什么......

 /*
  * Compares every collected HTML tag with every known tag.
  *
  * NOTE(review): both loop bounds use sizeof on a two-dimensional char
  * array, which yields its size in bytes, not its number of rows.
  */
 int n;
 /* NOTE(review): with char htmlfiletags[MAX_SIZE][30] this bound is
  * MAX_SIZE * 30, not the number of tags that were actually stored. */
 for(n=0; n<sizeof(htmlfiletags); ++n)
 {
     int j; int howmanytimesnot=0;

     /*
      * NOTE(review): with char tags[189][30] this bound is 189 * 30 = 5670,
      * while valid row indices end at 188; tags[j] is out of bounds from
      * j = 189 onwards.
      */
     for(j=0; j<sizeof(tags); ++j)
     {
         printf("\tcheck__3/1\n");


         /* NOTE(review): the counter is incremented on a match
          * (strcmp() == 0), despite its name. */
         if(strcmp(htmlfiletags[n],tags[j])==0)
         {
             printf("\t%d\n", howmanytimesnot);

             ++howmanytimesnot;
         }
     }

    printf("\tcheck__3/3\n");

     /* NOTE(review): true only when the counter reaches sizeof(tags). */
     if(!(howmanytimesnot<sizeof(tags)))
        {
            printf("\tcheck__3/4\n");
          printf("the file is not wellformed");

          exit (1);
        }

 }

感谢所有回复!!

4 个答案:

答案 0 :(得分:1)

您的代码非常复杂,它有几个问题。

这是一个:

for(j=0; j<sizeof(tags); ++j)

这并不会像您期望的那样工作:sizeof(tags)不是tags的数组长度(它声明为char tags[189][30];),而是整个变量占用的总字节数。因此,这个循环会从0一直执行到189 * 30 - 1,即5669,索引会远远越过数组的末尾。

此外,在这里无论以何种方式使用sizeof,思路本身就是错误的:tags中有多少有效内容取决于文件,编译器无从得知。请记住,对于这样的表达式,sizeof是在编译时求值的。

您需要一个变量(例如size_t num_tags),每从标签文件中解析一行就将它加一,之后用它作为遍历tags时的上界。

答案 1 :(得分:0)

不要使用正则表达式或某种手工的字符串解析来解析HTML。而应该在网上或本站搜索用于解析HTML的C库,然后在解析后的HTML文件中检查标签。这样可以简化开发过程,因为您不必自己解析文件。

答案 2 :(得分:0)

我修复了一些问题,但是:- 我仍然无法检查文件中的HTML标签,程序还是在同一个循环里崩溃(标签数组的分配我已经修改过了);- 当htmlfile中有两个或更多不同的注释、而我要替换其中的日期注释时,程序能把它替换为系统日期,但其他注释却被错误地复制,例如 =>

代码现在是:

    #include <stdlib.h>
    #include <stdio.h>
    #include <string.h>
    #include <time.h>

    #define MAX_SIZE 512


    void menu();
    void check();
    void datumos();


    /* Program entry point: hands control to the interactive menu loop. */
    int main(int argc,char *argv[])
    {
        /* The command-line arguments are not used by this program. */
        (void)argc;
        (void)argv;

        menu();
        return 0;
    }

    /*
     * menu() - interactive command loop.
     *
     * Prints the available commands, reads one whitespace-delimited word
     * from stdin and dispatches on it: "c" runs check(), "d" runs
     * datumos(); any other word simply redisplays the menu.
     *
     * Takes no arguments and returns nothing.  The loop ends when stdin
     * reaches end-of-file or a read error occurs (or when the user presses
     * CTRL + C).
     */
    void menu()
    {
        char choice[MAX_SIZE];

        for (;;)
        {
            printf("\npress a button:\n\n");

            printf("\tFile HTML check..............:c\n");
            /* The double quotes inside the literal have to be escaped. */
            printf("\t<!--#echo var=\"date\" -->...........:d\n");
            printf("\tExit:\tCTRL + C\n");

            /*
             * The field width (MAX_SIZE - 1 = 511) keeps a long word from
             * overflowing the buffer.  Anything other than one successful
             * conversion means EOF or a read error: leave instead of
             * spinning.
             */
            if (scanf("%511s", choice) != 1)
            {
                return;
            }

            if (strcmp(choice, "c") == 0)
            {
                check();
            }
            else if (strcmp(choice, "d") == 0)
            {
                datumos();
            }
        }
    }

    /*
     * check() - intended to verify that the tags found in htmlfile.html are
     * all present in the list of known tags read from tags.txt.
     *
     * Takes no arguments and returns nothing.  Progress markers
     * ("check__N") are printed along the way, and exit(1) is called when
     * the final test decides that the file is not well formed.
     *
     * NOTE(review): this revision still has several defects; each one is
     * marked inline below.
     */
    void check()
    {
        FILE *htmlfile;
        FILE *checkfile;

        /* NOTE(review): neither fopen() result is checked for NULL, and
         * neither stream is ever closed. */
        htmlfile = fopen("htmlfile.html","r");
        checkfile = fopen("tags.txt","r");

        char line[MAX_SIZE];
        char htmlline[MAX_SIZE];


        int i2=0;

        printf("\tcheck__1\n");
        /* First pass over tags.txt: count its lines. */
        while(fgets(line,sizeof(line),checkfile) != NULL)
        {
            ++i2;

        }


        /*
         * Variable-length array with one 20-byte row per line counted.
         * NOTE(review): if tags.txt is empty, i2 is 0 and a zero-length
         * VLA is not valid.
         */
        char tags[i2][20];

        int i=0;

        printf("\tcheck__11\n");
        /*
         * NOTE(review): the first pass left checkfile at end-of-file and
         * there is no rewind() in between, so this loop body never runs
         * and tags stays uninitialised.
         */
        while(fgets(line,sizeof(line),checkfile) != NULL)
        {
            int j;
            /*
             * NOTE(review): sizeof(line) is MAX_SIZE (512), but a row of
             * tags is only 20 bytes; this copy would write far past
             * tags[i].  fgets() also keeps the trailing '\n'.
             */
            for(j=0; j<sizeof(line); ++j)
            {

                tags[i][j]=line[j];
            }
            ++i;

        }
        printf("\tcheck__2\n");

        /* k counts the tags collected from the HTML file. */
        int k=0;    char htmlfiletags[MAX_SIZE][30];
        while(fgets(htmlline,sizeof(htmlline),htmlfile) != NULL)
        {
            /*
             * NOTE(review): currentline is never filled from htmlline, so
             * every read of it below uses uninitialised data.
             */
            char currentline[sizeof(htmlline)];
            int j=0;

                /*
                 * NOTE(review): "<" is a string literal (a pointer);
                 * comparing a char with it is not a character comparison.
                 * '<' is meant.
                 */
                if( currentline[j]=="<" )
                {

                        /* NOTE(review): same problem; '>' is meant.  There
                         * is also no stop at the end of the string or at
                         * 30 bytes. */
                        while(currentline[j]!=">")
                        {
                            htmlfiletags[k][j]=currentline[j];
                            ++j;
                        }
                        /*
                         * NOTE(review): the first argument of strcat()
                         * must be a char *; htmlfiletags[k][j+1] is a
                         * single char.
                         */
                        strcat(htmlfiletags[k][j+1],">");
                        ++k;
                }
        }
        printf("\tcheck__3\n");

         int n;
         /*
          * NOTE(review): sizeof(htmlfiletags) is the array's size in bytes
          * (MAX_SIZE * 30), not the number of tags collected (k).
          */
         for(n=0; n<sizeof(htmlfiletags); ++n)
         {
             int j; int howmanytimesnot=0;

             /*
              * NOTE(review): sizeof(tags) is i2 * 20 bytes, while tags has
              * only i2 rows; tags[j] is out of bounds from j = i2 onwards.
              */
             for(j=0; j<sizeof(tags); ++j)
             {
                 //printf("\tcheck__3/1\n");

                 /* NOTE(review): the counter is incremented on a match
                  * (strcmp() == 0), despite its name. */
                 if(strcmp(htmlfiletags[n],tags[j])==0)
                 {
                   //  printf("\t%d\n", howmanytimesnot);
                     ++howmanytimesnot;
                 }
             }

            printf("\tcheck__3/3\n");

             /* NOTE(review): true only when the counter reaches
              * sizeof(tags). */
             if(!(howmanytimesnot<sizeof(tags)))
                {
                    printf("\tcheck__3/4\n");
                  printf("the file is not wellformed");

                  exit (1);
                }

         }
         printf("\tcheck__4\n");

    }

    /*
     * copy_file() - rewrites `to` from the lines of `from`.
     *
     * from: path of the file that is read.
     * to:   path of the file that is written; it is opened with "w", so
     *       any previous content is discarded.
     *
     * After both streams are closed, tempfile.html is deleted.
     */
    void copy_file(const char *from,const char *to)
    {
        FILE *fr;
        FILE *t;
        /* NOTE(review): neither fopen() result is checked for NULL. */
        fr = fopen(from,"r");
        t = fopen(to,"w");

        char line[MAX_SIZE];
        char row[MAX_SIZE];

        while(fgets(line,sizeof(line),fr) != NULL)
        {
            /*
             * NOTE(review): "%s" skips leading whitespace and stops at the
             * next whitespace character, so only the first word of each
             * line reaches the output: "<!--#include something -->" is
             * written back as "<!--#include".  For a blank line sscanf()
             * stores nothing and row keeps whatever it held before.
             * fputs(line,t) would copy the line unchanged.
             */
            sscanf(line,"%s",row);
            fprintf(t,"%s\n",row);
        }

        fclose(fr);
        fclose(t);

        /* NOTE(review): the file name is hard-coded here; the `from`
         * argument is not what gets removed. */
        remove("tempfile.html");
    }


    /*
     * datumos() - replaces the SSI date directive in htmlfile.html.
     *
     * Every occurrence of  <!--#echo var="date" -->  is rewritten as an
     * HTML comment holding the current local date and time (strftime
     * "%x_%X").  All other text, including other comments and the original
     * line endings, is written to tempfile.html unchanged.  The temporary
     * file is then handed to copy_file() to be written back to
     * htmlfile.html.
     *
     * Takes no arguments and returns nothing.  If the time cannot be
     * formatted or one of the files cannot be opened, the problem is
     * reported on stderr and the function returns without calling
     * copy_file().
     *
     * A directive that straddles a MAX_SIZE - 1 character boundary of a
     * very long line is not recognised, because the file is processed in
     * fgets()-sized pieces.
     */
    void datumos()
    {
        static const char datecomment[] = "<!--#echo var=\"date\" -->";
        const size_t datecomment_len = sizeof(datecomment) - 1;

        time_t now = time(NULL);
        struct tm *t = localtime(&now);
        char date_time[30];

        /* strftime() returns 0 when the text does not fit into the buffer. */
        if (t == NULL || strftime(date_time, sizeof(date_time), "%x_%X", t) == 0)
        {
            fprintf(stderr, "datumos: cannot format the current date\n");
            return;
        }

        FILE *htmlfile = fopen("htmlfile.html", "r");
        if (htmlfile == NULL)
        {
            perror("htmlfile.html");
            return;
        }

        FILE *tempfile = fopen("tempfile.html", "w");
        if (tempfile == NULL)
        {
            perror("tempfile.html");
            fclose(htmlfile);
            return;
        }

        char line[MAX_SIZE];

        while (fgets(line, sizeof(line), htmlfile) != NULL)
        {
            /*
             * strstr() stops at the string terminator, unlike a manual
             * index loop over strlen(datecomment) characters, which reads
             * stale buffer content when the line is shorter than the
             * directive.
             */
            const char *rest = line;
            const char *hit;

            while ((hit = strstr(rest, datecomment)) != NULL)
            {
                /* Text in front of the directive is copied unchanged. */
                fwrite(rest, 1, (size_t)(hit - rest), tempfile);
                /*
                 * Written straight to the stream: a fixed 30-byte row
                 * cannot hold "<!--" + up to 29 date characters + "-->\n".
                 */
                fprintf(tempfile, "<!--%s-->", date_time);
                rest = hit + datecomment_len;
            }

            /* Remainder of the line, including its original line ending. */
            fputs(rest, tempfile);
        }

        fclose(htmlfile);
        fclose(tempfile);

        copy_file("tempfile.html","htmlfile.html");

    }

答案 3 :(得分:0)

  • currentline这个变量没有必要,我也修正了比较(改用字符常量'<'和'>')

    /*
     * Collects the leading tag of every line of the HTML file into
     * htmlfiletags[0..k-1].  Only lines that start with '<' and contain a
     * closing '>' are recorded; the stored text runs from '<' up to and
     * including the first '>' and is NUL-terminated (no trailing newline).
     * Lines whose tag does not fit into one row, and tags beyond the
     * capacity of htmlfiletags, are skipped.
     */
    while(fgets(htmlline,sizeof(htmlline),htmlfile) != NULL)
    {
        if( htmlline[0]=='<' )
        {
            /* strchr() stops at the terminator, so a line without '>'
             * cannot make the scan run off the end of the buffer. */
            const char *closing = strchr(htmlline,'>');
            size_t len = (closing != NULL) ? (size_t)(closing - htmlline) + 1 : 0;

            if( len > 0
                && len < sizeof(htmlfiletags[0])
                && (size_t)k < sizeof(htmlfiletags)/sizeof(htmlfiletags[0]) )
            {
                /* Copy "<...>" and terminate explicitly; strcat() cannot
                 * be used here because the row is not yet a string. */
                memcpy(htmlfiletags[k],htmlline,len);
                htmlfiletags[k][len]='\0';
                ++k;
            }
        }
    }
    

- 此外,另一个问题仍然没有解决:只替换目标注释、同时不破坏其他注释,这一点仍然无法正常工作

也就是说,它会把

  <!--#echo var="date" --> 替换为系统日期,这没有问题;但如果文件中还有其他注释,例如

  <!--#include something -->,它就不会被正确地复制回去,htmlfile中只会剩下 <!--#include

想法?