Question

我试图在我的Ubuntu x86_64上的分叉进程之间共享一个文本文件：该文件不会非常大，因为只有在文件中没有另一个相同的字符串时才会写入字符串;字符串将是访问过的网站的主机名，因此我假设每个主机名不超过255个字节。

每个进程轮到自己向共享对象写入时，写入本身是正常的；所有进程都写完之后，msync应该把这些写入同步到磁盘上，但生成的mapped.txt文件只包含arrayString中的一个字符串，即最后一个向共享对象写入的进程所写的那个字符串。

以下是代码：

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <semaphore.h>
#include <string.h>

// Strings written by the children: the child forked at loop index i writes
// arrayString[i], so the first forked process writes "first", and so on.
const char *arrayString[] = {
    "first", 
    "second",
    "third"
};

// Forks `children` processes. Each child, while holding a named semaphore,
// copies its string from arrayString into a shared mapping of mapped.txt and
// then msync()s it. Returns EXIT_FAILURE if open() or mmap() fails.
int main(void) {

    int index;
    int children = 3;
    const char *filepath = "mapped.txt";
    sem_t *sem;

    // Named semaphore with initial value 1; the children bracket their writes
    // with sem_wait()/sem_post() on it. The name is removed again right away.
    // NOTE(review): the result of sem_open() is not checked before use.
    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    // Map getpagesize() bytes of the file, readable, writable and shared,
    // starting at file offset 0.
    char *data;
    data = (char *)mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("mmap:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        if (fork() == 0) {
            // Child process: everything up to sem_post() runs under the semaphore.
            sem_wait(sem);

            // Length of this child's string including the terminating NUL.
            size_t textsize = strlen(arrayString[index])+1;

            // NOTE(review): the length passed here is sizeof(textsize), i.e. the
            // size of the size_t variable, not the value of textsize.
            // NOTE(review): on failure the child returns from main without
            // calling sem_post().
            if (ftruncate(fd, sizeof(textsize)) == -1) {
                perror("ftruncate:");
                return EXIT_FAILURE;
            }

            // Copy the string, NUL included, into the mapping. Every child
            // starts at data[0], whatever earlier children stored there.
            for (size_t i = 0; i < textsize; i++) {
                printf("%d Writing character %c at %zu\n", getpid(), arrayString[index][i], i);
                data[i] = arrayString[index][i];
            }

            // Read the bytes back from the mapping and print them.
            printf("%d wrote ", getpid());
            for (size_t i = 0; i < textsize; i++) {
                printf("%c", data[i]);
            }
            printf("\n");

            // Synchronously flush the first textsize bytes of the mapping;
            // a failure is reported but does not stop the child.
            if (msync(data, textsize, MS_SYNC) == -1) {
                perror("Could not sync the file to disk");
            }

            sem_post(sem);
            _exit(EXIT_SUCCESS);
        }
    }
    // Parent: closes its descriptor and returns; there is no wait() for the
    // children and no munmap() of data in this function.
    close(fd);

    return EXIT_SUCCESS;
}

对于三个子进程，这是上面代码的一个可能输出（这很好）：

20373 Writing character s at 0
20373 Writing character e at 1
20373 Writing character c at 2
20373 Writing character o at 3
20373 Writing character n at 4
20373 Writing character d at 5
20373 Writing character  at 6
20373 wrote second
20374 Writing character t at 0
20374 Writing character h at 1
20374 Writing character i at 2
20374 Writing character r at 3
20374 Writing character d at 4
20374 Writing character  at 5
20374 wrote third
20372 Writing character f at 0
20372 Writing character i at 1
20372 Writing character r at 2
20372 Writing character s at 3
20372 Writing character t at 4
20372 Writing character  at 5
20372 wrote first

这里有mapped.txt的内容（这很糟糕）：

first^@^@^@

我期待：

second
third
first

但我得到的只是最后一个进程的字符串，还带有那些奇怪的符号。我希望将此文件保留在内存中；由于I/O速度慢，我尝试使用内存映射。知道为什么我的文件只包含最后一个访问共享文件的进程所写的字符串吗？

编辑：我想我明白了，现在似乎工作了：我希望它对某人有所帮助。用g++ -g -o mapthis mapthis.cpp -lrt -pthread编译。请注意，缺少某些错误检查，例如fsync，snprintf和lseek。

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <stdlib.h>
#include <semaphore.h>
#include <time.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>

// Hostnames handed to the children: main forks one child per entry. Each of
// the five hostnames appears three times in the list.
const char *arrayString[] = {
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com",
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com",
    "www.facebook.com",
    "www.google.com",
    "www.cnn.com",
    "www.speechrepository.com",
    "www.youtube.com"
};

// Longest record line ever built: a hostname of up to 255 bytes, one space,
// a 10-digit timestamp, '\n' and the terminating NUL.
enum { RECORD_MAX = 255 + 1 + 10 + 1 + 1 };

// Looks for the record of `host` in the first `len` bytes of `data`.
// A record is a line "<host> <seconds>\n"; only a line that starts with the
// complete hostname followed by a space matches, so "a.com" does not match a
// line for "www.a.com". The search never reads at or beyond data + len, so
// the mapping does not have to be NUL-terminated.
// Returns a pointer to the first byte of the matching line, or NULL.
static char *find_record(char *data, size_t len, const char *host) {
    const size_t hostlen = strlen(host);
    char *const end = data + len;
    char *line = data;

    while (line < end) {
        char *nl = (char *)memchr(line, '\n', (size_t)(end - line));
        const size_t linelen = nl ? (size_t)(nl - line) : (size_t)(end - line);

        if (linelen > hostlen && line[hostlen] == ' '
                && memcmp(line, host, hostlen) == 0) {
            return line;
        }
        if (nl == NULL) {
            break;
        }
        line = nl + 1;
    }
    return NULL;
}

// Records a visit to `host` in the file behind `fd`.
// If the file already holds a record for `host`, prints how long ago it was
// visited and overwrites that record in place with the current time;
// otherwise appends a new "<host> <seconds>\n" line.
// `*data` / `*mapsize` describe the caller's mapping of the file; the mapping
// is grown with mremap() to the current file size before it is searched, and
// both values are updated when that happens.
// The caller must hold the semaphore. Returns 0 on success, -1 on failure.
static int record_visit(int fd, char **data, size_t *mapsize, const char *host) {
    char record[RECORD_MAX];
    struct stat st;
    const size_t hostlen = strlen(host);
    const unsigned long now = (unsigned long)time(NULL);
    char *hostPos = NULL;
    int nw;

    // snprintf returns the length it *wanted* to write; reject truncation so
    // the write() below never reads past the end of `record`.
    nw = snprintf(record, sizeof(record), "%s %010lu\n", host, now);
    if (nw < 0 || (size_t)nw >= sizeof(record)) {
        fprintf(stderr, "hostname too long: %s\n", host);
        return -1;
    }

    if (fstat(fd, &st) < 0) {
        perror("fstat:");
        return -1;
    }

    // An empty file has nothing to search, and touching its mapping would
    // fault, so only look at the mapping when the file has content.
    if (st.st_size > 0) {
        const size_t filesize = (size_t)st.st_size;

        // Earlier children may have appended to the file: make the mapping
        // cover exactly the current contents before reading it.
        if (filesize != *mapsize) {
            char *moved = (char *)mremap(*data, *mapsize, filesize, MREMAP_MAYMOVE);
            if (moved == MAP_FAILED) {
                perror("mremap:");
                return -1;
            }
            *data = moved;
            *mapsize = filesize;
        }
        hostPos = find_record(*data, filesize, host);
    }

    if (hostPos != NULL) {
        // Host already recorded: parse the old timestamp from a bounded copy
        // of the digits, then position the descriptor on the old record.
        const char *end = *data + st.st_size;
        const char *numPos = hostPos + hostlen + 1;
        const char *nl = (const char *)memchr(numPos, '\n', (size_t)(end - numPos));
        char digits[32];
        size_t numlen = (size_t)((nl ? nl : end) - numPos);
        unsigned long before;

        if (numlen >= sizeof(digits)) {
            numlen = sizeof(digits) - 1;
        }
        memcpy(digits, numPos, numlen);
        digits[numlen] = '\0';
        before = strtoul(digits, NULL, 10);

        printf("%s visited %lu seconds ago (%lu - %lu)\n",
            host, now - before, now, before);

        // Overwriting in place is only safe when the old line has exactly the
        // length of the new one; otherwise the next record would be damaged.
        if (nl == NULL || (size_t)(nl + 1 - hostPos) != (size_t)nw) {
            fprintf(stderr, "malformed record for %s, not updated\n", host);
            return -1;
        }
        if (lseek(fd, (off_t)(hostPos - *data), SEEK_SET) == (off_t)-1) {
            perror("lseek:");
            return -1;
        }
    }
    else {
        // Host not recorded yet (or file empty): append at the end.
        if (lseek(fd, 0, SEEK_END) == (off_t)-1) {
            perror("lseek:");
            return -1;
        }
    }

    if (write(fd, record, (size_t)nw) != (ssize_t)nw) {
        perror("write:");
        return -1;
    }
    if (fsync(fd) < 0) {
        // Best effort: the data was written, only the flush failed.
        perror("fsync:");
    }
    return 0;
}

// Forks one child per entry of arrayString. Each child, while holding a named
// semaphore, records a visit to its hostname in mapped.txt (see
// record_visit). The parent sleeps one second before each fork and waits for
// the child before forking the next one.
// Returns EXIT_FAILURE if the semaphore, the file or the mapping cannot be
// set up, EXIT_SUCCESS otherwise.
int main(void) {

    int index;
    int children = sizeof(arrayString) / sizeof(arrayString[0]);
    const char *filepath = "mapped.txt";
    sem_t *sem;
    char *data;
    size_t mapsize;
    struct stat filestats;

    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    if (sem == SEM_FAILED) {
        perror("sem_open:");
        return EXIT_FAILURE;
    }
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    if (fstat(fd, &filestats) < 0) {
        close(fd);
        perror("fstat:");
        return EXIT_FAILURE;
    }

    // mmap() rejects a zero length, and a freshly created file is empty, so
    // map at least one byte. mapsize remembers what was really mapped so that
    // mremap()/munmap() are always given the true length.
    mapsize = filestats.st_size ? (size_t)filestats.st_size : 1;
    data = (char *)mmap(NULL, mapsize, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("first map:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        // NOTE(review): presumably here so consecutive records get different
        // timestamps in the demo output.
        sleep(1);
        pid_t pid = fork();
        if (pid == 0) {
            int rc;

            sem_wait(sem);
            rc = record_visit(fd, &data, &mapsize, arrayString[index]);
            munmap(data, mapsize);
            close(fd);
            sem_post(sem);
            _exit(rc == 0 ? EXIT_SUCCESS : EXIT_FAILURE);
        }
        else if (pid > 0) {
            wait(NULL);
        }
        else {
            perror("fork:");
        }
    }
    munmap(data, mapsize);
    close(fd);

    return EXIT_SUCCESS;
}

Answer 1

这一行存在问题：

if (ftruncate(fd, sizeof(textsize)) == -1) {

textsize是size_t，其sizeof只会是4或8（分别对应32位和64位系统）。看起来你用的是64位系统，因此在每次写入之前，你都无条件地把文件截断为8字节。“奇怪的符号”就是编辑器显示NUL/零字节的方式。即使你改用ftruncate(fd, textsize)，你仍然会把文件截断为你要写入的那个字符串的长度，从而覆盖其他子进程可能已经写入的任何数据；我认为你在这里并不需要调用ftruncate。

对于来自不同进程的连续追加（它们无法共享各自要添加的数据的大小或偏移量信息），内存映射并不合适；为什么不让每个进程在持有锁的情况下lseek到文件末尾，然后调用write呢？你仍然可以用内存映射来做重复检查（其中一部分检查可以不加锁），只是做法会有些不同。像这样：

// Forks `children` processes. Each child, while holding a named semaphore,
// appends its string from arrayString plus a newline to mapped.txt unless the
// string is already present in the file. The file is mapped read-only and is
// used only for the duplicate check; all writing goes through write().
// Returns EXIT_FAILURE if the semaphore, the file or the mapping cannot be
// set up, EXIT_SUCCESS otherwise.
int main(void) {
    struct stat filestats;
    int index;
    int children = 3;
    const char *filepath = "mapped.txt";
    sem_t *sem;
    char *data;
    size_t mapsize;

    sem = sem_open("semaphore", O_CREAT | O_EXCL, 0644, 1);
    if (sem == SEM_FAILED) {
        perror("sem_open:");
        return EXIT_FAILURE;
    }
    sem_unlink("semaphore");
    int fd;
    fd = open(filepath, O_RDWR | O_CREAT, 0644);
    if (fd < 0) {
        perror("open:");
        return EXIT_FAILURE;
    }

    // Mostly just to ensure it's mappable, we map the current size of the file
    // If the file might already have values, and many child workers won't add
    // to it, this might save some mapping work in the children; you could
    // just map in the children when needed though
    if (fstat(fd, &filestats) != 0) {
        close(fd);
        perror("fstat:");
        return EXIT_FAILURE;
    }
    // mmap() fails with EINVAL for a zero length, and a freshly created file
    // is empty, so map at least one byte. mapsize tracks the length actually
    // mapped so mremap()/munmap() are never handed 0 or a stale size.
    mapsize = filestats.st_size > 0 ? (size_t)filestats.st_size : 1;
    data = mmap(NULL, mapsize, PROT_READ, MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        close(fd);
        perror("mmap:");
        return EXIT_FAILURE;
    }

    for (index=0; index<children; index++) {
        if (fork() == 0) {
            int status = EXIT_SUCCESS;
            sem_wait(sem);

            // remap to current file size if it changed
            // If you're not on Linux, you'd just have to mmap from scratch
            // since mremap isn't standard
            if (fstat(fd, &filestats) != 0) {
                // Report first: close()/sem_post() may overwrite errno.
                perror("fstat:");
                close(fd);
                sem_post(sem);
                _exit(EXIT_FAILURE);
            }
            if (filestats.st_size > 0 && (size_t)filestats.st_size != mapsize) {
                char *moved = mremap(data, mapsize, (size_t)filestats.st_size, MREMAP_MAYMOVE);
                if (moved == MAP_FAILED) {
                    perror("mremap:");
                    close(fd);
                    sem_post(sem);
                    _exit(EXIT_FAILURE);
                }
                data = moved;
                mapsize = (size_t)filestats.st_size;
            }

            // Not safe to use strstr since mapping might not end with NUL byte
            // You'd need to workaround this, or implement your own memstr-like
            // function, expected here as
            //   char *memstr(const char *haystack, size_t haystack_len, const char *needle);
            // An empty file cannot contain the string, and touching the
            // mapping of an empty file would fault, so skip the search then.
            if (filestats.st_size == 0
                    || !memstr(data, (size_t)filestats.st_size, arrayString[index])) {
                const size_t len = strlen(arrayString[index]);

                // Move fd to end of file, so we append new data
                if (lseek(fd, 0, SEEK_END) == (off_t)-1
                        || write(fd, arrayString[index], len) != (ssize_t)len
                        || write(fd, "\n", 1) != 1) {
                    perror("append:");
                    status = EXIT_FAILURE;
                }
                fsync(fd);
            }
            munmap(data, mapsize);
            close(fd);
            sem_post(sem);
            _exit(status);
        }
    }
    munmap(data, mapsize);
    close(fd);

    return EXIT_SUCCESS;
}

我引用的memstr需要手工实现（或者你需要做一些很糟糕的事情，比如确保文件末尾总是有一个NUL字节，这样你就可以对它使用strstr）；关于如何实现，原文此处链接了一些提示（该链接在本文中已丢失）。

Answer 2

您把所有字符串都写到了文件偏移0处，每个字符串都覆盖了前一个字符串。循环的核心应该是

struct stat status;
fstat(fd, &status);
size_t cursize = status.st_size;
ftruncate(fd, cursize + textsize);
for (size_t i = 0; i < textsize; i++) {
    data[cursize + i] = arrayString[index][i];
}

使用open（）和mmap（）在进程之间共享文本文件

2 个答案: