Question

我需要在 HTML 文件中搜索标题（字符串）。为此，我用 strstr 来查找标签 "li"，该标签内包含属性 "title=\""，而这个属性里就是我想要的字符串。

例如：使用下面的这个数组，我需要获得书名，内部标题。但是，我需要html体内的所有标题，它有数百个。

<li><i><a href="/wiki/Animal_Farm" title="Animal Farm">A Revolução dos Bichos</a></i> (<a href="/wiki/1945" title="1945">1945</a>), de <a href="/wiki/George_Orwell" title="George Orwell">George Orwell</a>.</li>

我尝试用 strlen 得到 "for" 循环的条件（行长度）。在这个循环里，我用 strstr 找到 title="，最后复制字符串，直到遇到结束的引号为止。

类似的东西：

/* Asker's sketch of the attempted approach; it is not valid C as written. */
for (i=0, i<len, i++){                      /* the for-header clauses are separated by commas instead of semicolons */
    if(strstr(array[i] == " title=\""){     /* strstr is given one comparison expression instead of (haystack, needle); a ')' is missing */
        do{
    temp[i] = array[i];                     /* i is never advanced inside this do/while */
          }while((strcmp(array[i], "\""));  /* strcmp is applied to array[i]; the parentheses are unbalanced here too */
    }
}

这就是我挣扎的观点。如何使用模式（条件）获取字符串，字符串内部？有什么建议吗？

提前谢谢！问候。

Answer 1

用"正确的方式"解析 HTML 远比一次检查一个字符串复杂得多。我下面的代码做得不够规范的地方不少——但部分原因是缺乏信息。

您的 HTML 格式正确吗？title 属性中是否可能包含字符串 li 或 title，或者零散的 < 或 > 字符？您是否需要考虑标签内部可能出现的空格，例如 < li >？所有属性都是用双引号 " 书写的，还是也可能使用单引号 '？

我的代码展示了 HTML 解析的一般思路：从一个 < 跳到下一个 <，并检查其后面的 HTML 标签。但正如你所看到的，它很丑陋；虽然它能完成这项任务，却几乎无法维护。

对于条件明确的快速任务，它或许够用；对于其他所有情况，请寻找一个通用的 HTML 解析库，它能让您免受上述问题的困扰，并为元素和属性提供友好的接口。

#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>

/*
 * Return nonzero when `p` starts with the tag name `name` (compared
 * case-insensitively) and that name is not just the prefix of a longer
 * alphanumeric name, so "LI" matches "li>" but not "link".
 */
static int tag_name_is(const char *p, const char *name)
{
    size_t len = strlen(name);

    /* A successful compare guarantees p[len] is still inside the string.
       The cast keeps isalnum() defined for bytes >= 0x80 (UTF-8 text). */
    return strncasecmp(p, name, len) == 0 && !isalnum((unsigned char)p[len]);
}

/*
 * Advance `p` to just past the next '>' and return it; when the input
 * ends first, return a pointer to the terminating NUL instead.
 */
static const char *skip_past_tag_end(const char *p)
{
    while (*p && *p != '>')
        p++;
    if (*p == '>')
        p++;
    return p;
}

/*
 * Scan one list item. `p` points at the '<' of its opening "<li" tag.
 * Every tag up to the closing "</li>" is searched for a title="..."
 * attribute; the first value found is printed as "title [value]" and the
 * rest of the item is skipped.
 *
 * Returns a pointer just past the closing "</li>" tag, or to the
 * terminating NUL when the input ends before the item is closed.
 * Never returns NULL.
 */
static const char *scan_list_item(const char *p)
{
    while (*p)
    {
        if (*p != '<')
        {
            p++;
            continue;
        }

        /* Step over '<' so that p points at the tag name itself. */
        p++;

        /* An "</LI>" tag ends the item without a title having been found. */
        if (tag_name_is(p, "/LI"))
            return skip_past_tag_end(p);

        /* Any other tag: look through its attributes for title="...". */
        while (*p && *p != '>')
        {
            if (isspace((unsigned char)*p) &&
                strncasecmp(p + 1, "TITLE=\"", 7) == 0)
            {
                const char *closing;

                /* Skip the whitespace character plus the 7 characters of title=" */
                p += 8;
                printf("title [");
                while (*p && *p != '"')
                {
                    putchar(*p);
                    p++;
                }
                printf("]\n");
                fflush(stdout);

                /* Only the first title of an item is reported: jump to the
                   item's closing tag, or to the end of the input when the
                   item is never closed. */
                closing = strchr(p, '<');
                while (closing != NULL && !tag_name_is(closing + 1, "/LI"))
                    closing = strchr(closing + 1, '<');
                return closing != NULL ? skip_past_tag_end(closing)
                                       : p + strlen(p);
            }
            p++;
        }
        if (*p == '>')
            p++;
    }
    return p;
}

/*
 * Demo: walk the embedded HTML fragment from one '<' to the next and print
 * the first title attribute found inside each <li> ... </li> element.
 */
int main(void)
{
    const char str[] = "<li><i><a href=\"/wiki/Animal_Farm\" title=\"Animal Farm\">A Revolução dos Bichos</a></i> (<a href=\"/wiki/1945\" title=\"1945\">1945</a>), de <a href=\"/wiki/George_Orwell\" title=\"George Orwell\">George Orwell</a>.</li>"
                       "<li><i><a href=\"/wiki/Animal_Farm_II\" title=\"Animal Farm II: Return of the Hog\">A Revolução dos Bichos</a></i> (<a href=\"/wiki/1945\" title=\"1945\">1945</a>), de <a href=\"/wiki/George_Orwell\" title=\"George Orwell\">George Orwell</a>.</li>";
    const char *html_walker = str;

    while ((html_walker = strchr(html_walker, '<')) != NULL)
    {
        if (tag_name_is(html_walker + 1, "LI"))
        {
            /* Consumes the whole item; the result is never NULL. */
            html_walker = scan_list_item(html_walker);
        } else
        {
            /* Not a list item: resume the search after this '<'. */
            html_walker++;
        }
    }
    return 0;
}

如何在html中搜索字符串模式，用C编码？

1 个答案: