Question

我用C编写了一个程序，用于统计文件中每个单词的出现次数，并按出现次数排序，使出现最多的单词排在最前、出现最少的单词排在最后。但是，我需要使用pthread创建多个线程，线程数由命令行参数指定。文件需要按输入的线程数分成相应的份数。例如，假设命令行参数为4，那么文件需要分成四个部分，每个部分由一个新线程处理，最后再把这四个部分的结果合并起来。我的C不是很好，不知道该如何实现。有人可以帮忙吗？如果能给一个例子就太好了。

到目前为止，这是我的代码：

    /*
     * Entry point: validates the command line, maps the input file read-only
     * and unmaps it again.
     *
     * argv[1] - path of the file to process (must be a regular file)
     * argv[2] - number of worker threads (must be >= 1)
     *
     * Exits with status 1 on any validation or system-call failure.
     */
    int main(int argc, char **argv) {
       struct stat fileStat;
       FILE *out;
       char *address;
       int size, res, file, num_threads;

       /* Both argv[1] and argv[2] are read below, so require them up front. */
       if (argc < 3) {
          fprintf(stderr, "usage: %s <file> <num_threads>\n", argv[0]);
          exit(1);
       }

       list_t *words = (list_t *)malloc(sizeof(list_t));
       if (words == NULL) {
          exit(1);
       }

       /* Bail out early when the path does not exist. */
       res = access(argv[1], F_OK);
       if (res != 0) {
          exit(1);
       }

       /* fileStat is only meaningful when stat() succeeds. */
       if (stat(argv[1], &fileStat) != 0) {
          exit(1);
       }

       // Check if a file.
       if (S_ISREG(fileStat.st_mode)) {
          file = open(argv[1], O_RDONLY);
          if (file < 0)
             exit(1);
          // Check the total size of the file
          size = fileStat.st_size;
          num_threads = atoi(argv[2]);
          /* atoi returns 0 for non-numeric input; at least one thread is needed. */
          if (num_threads < 1) {
             exit(1);
          }
          address = (char *)mmap(0, size, PROT_READ, MAP_SHARED, file, 0);
          if (address == MAP_FAILED) {
             exit(1);
          }
          /* TODO: hand [address, address + size) to num_threads workers here,
           * before the mapping is released. */
          munmap(address, size);
          close(file);
       } else {
          exit(1);
       }

       return 0;
    }

Answer 1

多个线程可以安全地读取源文件。写入才是会出问题的地方。

我的建议（没有真正理解要求）是：

启动时确定文件大小
计算 文件大小 / 线程数 的值
假设文件大小为4k、线程数为4，则每个线程分到的大小约为1k
定位到文件中第1个块大小的位置，逐字节读取，直到找到单词分隔符
该位置就是线程1区域的结尾，也是线程2区域的开始
再定位到第2个和第3个块大小的位置，执行相同的操作
此时，您就得到了每个线程在文件中的起始和结束位置
启动每个线程，并把它负责的位置范围传给它
使用互斥技术使哈希表（或您用来统计单词的任何数据结构）线程安全，并让每个线程为它找到的每个单词累加计数
所有线程完成后，您就得到了结果列表

Answer 2

这里的想法是把工作分给多个线程，之后再把各部分的结果合并起来，这样完成同样的工作会快得多。所以你需要：

将工作分成许多部分而不浪费太多时间
找到一种轻松合并各部分结果的方法
解决因划分工作而导致的边界问题

第一部分很简单。只需在线程之间平均分配数据。

第二部分也很简单。只需把各线程的结果相加。

棘手的是第3部分。在你的情况下，一个单词可能恰好被切分到两个不同的线程中。因此，为避免把“半个单词”也计入，您必须为每个线程单独记录它遇到的第一个和最后一个单词。然后，当所有结果都返回后，把线程N的最后一个单词与线程N + 1的第一个单词拼接起来，再把拼接后的单词计入总数。如果某个线程读到的第一个/最后一个字符恰好是分隔符（空格、换行……），那么它的第一个/最后一个单词就是空的。

在伪代码中：

def main:
    # Map the whole file once; every worker reads only its own slice.
    size = filesize
    ptr = mmap(file)
    num_threads = 4
    chunk = size / num_threads   # integer division

    # Start at 0 so the first chunk gets a worker too.
    for i in range(0, num_threads):
        # The last worker also takes the remainder left by the integer division.
        length = chunk if i < num_threads - 1 else size - i * chunk
        new_thread(exec = count_words,
                   start = ptr + i * chunk,
                   length = length)

    wait_for_result(all_threads)
    join_the_results

def count_words(start, length):
    # Count the words in this segment as if it were an entire file,
    # but store the first/last word separately when the segment
    # does not start/end with a word separator
    # (" ", ".", ",", "\n", etc.).
    return (count_of_words, first_word, last_word)

这与MapReduce背后的想法相同。

Answer 3

这段代码的逻辑并不完美。我用的是C++。如果您一定要用C，可以用POSIX线程代替std::thread。另外，我只是把整个文件大小按线程数整除来分块；剩余的数据（文件大小除以线程数的余数）需要放在最后一个线程中处理，这一点我还没有实现。

另一点是我从线程获取返回值的方式。目前我把结果保存到一个全局数组中。C++11支持从线程取回返回值，参见：C++: Simple return value from std::thread?

#include <fstream>
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
using namespace std;

// Number of file chunks, and of worker threads, that main creates.
#define NO_OF_THREADS 4

// Per-thread results: getWordCount stores its count at index ID.
int countArray[100];
// Held by getWordCount while it prints its count and writes it into countArray.
std::mutex g_pages_mutex;

/**
 * Returns how many consecutive ' ' characters sit at the front of arr,
 * looking at no more than len characters.
 *
 * @param arr   buffer to inspect; scanning always begins at arr[0]
 * @param start not used by the scan (kept for existing callers)
 * @param len   maximum number of characters to examine
 * @return index of the first non-space character, or len if the first
 *         len characters are all spaces (0 when len <= 0)
 */
int trimWhiteSpaces(char *arr, int start, int len)
{
    (void)start;

    int skipped = 0;
    while (skipped < len && arr[skipped] == ' ')
    {
        ++skipped;
    }
    return skipped;
}

/**
 * Counts the space-separated words in arr[0, len) and publishes the result.
 *
 * Only the ' ' character is treated as a separator. Leading, trailing and
 * repeated spaces do not produce extra words, and an empty or all-space
 * segment counts as 0 words.
 *
 * @param arr segment to scan; does not need to be NUL-terminated
 * @param len number of valid characters in arr
 * @param ID  slot in countArray that receives the count
 */
void getWordCount(char *arr, int len, int ID)
{
    int count = 0;

    // Skip leading spaces so they are not mistaken for a word boundary.
    int i = trimWhiteSpaces(arr, 0, len);
    while (i < len)
    {
        // arr[i] is the first character of a word: count it, then consume it.
        count++;
        while (i < len && arr[i] != ' ')
        {
            i++;
        }
        // Consume the separator run. Only the remaining length is passed so
        // the scan can never read past arr[len - 1].
        i += trimWhiteSpaces(&arr[i], i, len - i);
    }

    // Serialize the print and the store; the guard releases the mutex on
    // every exit path.
    std::lock_guard<std::mutex> guard(g_pages_mutex);
    std::cout << "MYCOUNT:" << count << "\n";
    countArray[ID] = count;
}

int main(int argc, const char * argv[])
{
    char fileData[5000];
    std::thread threadIDs[100];
    int noOfThreads = NO_OF_THREADS;
    char *filePath = "/Users/abc/Desktop/test.txt";
    int read_sz = 0;
    int decrements = 0;
    bool previousNotEndsInSpace = false;

    std::ifstream is(filePath, std::ifstream::ate | std::ifstream::binary);
    int fileSize = is.tellg();
    int bulkSize = fileSize / NO_OF_THREADS;
    is.seekg(0);


    for(int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        int old_read_sz = read_sz;
        is.read(fileData, bulkSize);
        read_sz = is.tellg();
        fileData[read_sz - old_read_sz] = '\0';
        if(read_sz > 0)
        {
            cout << " data size so far: " << read_sz << "\n";
            cout << fileData << endl;
            if(previousNotEndsInSpace && fileData[0] != ' ')
            {
                decrements = decrements + 1;
            }

            if(fileData[read_sz - 1] != ' ')
            {
                previousNotEndsInSpace = true;
            }
            else
            {
                previousNotEndsInSpace = false;
            }
            //getWordCount(fileData, strlen(fileData), iter);
            threadIDs[iter] = std::thread(getWordCount, fileData, strlen(fileData), iter);
        }
    }

    for(int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        threadIDs[iter].join();
    }

    int totalCount = 0;
    for(int iter = 0; iter < NO_OF_THREADS; iter++)
    {
        cout << "COUNT: " << countArray[iter] << "\n";
        totalCount = totalCount + countArray[iter];
    }

    cout << "TOTAL: " << totalCount - decrements << "\n";
    return 0;
}

如何在C中使用pthread来计算单词出现次数？

3 个答案: