Skip to content

massive RSS memory usage

When using serd to parse RDF files, I observed that it uses a considerable amount of RAM even if I do nothing with the parsed data: about 10 GB of RSS to read a 15 GB file. I extracted a minimal example that reports RSS memory usage throughout execution and prints the pmap output after reading. Output:

file: /home/me/Downloads/watdiv.100M.nt
file size: 15233470 kB
before reading: 2688 kB RSS
after reading: 10221352 kB RSS
pmap 115703 
... # appended as file pmap_after_reading for better readability
after freeing reader: 2812 kB RSS

pmap_after_reading

I have tested with serd versions v0.30.{2,4,8,10}. For v0.30.0 and v0.28.0, the WAF build failed on my machine. I tried on Ubuntu 21.10 with gcc-11 and clang-13.

A large RDF file can be found here: https://dsg.uwaterloo.ca/watdiv/#download (e.g. 100M).

Minimal example code:

#include <serd-0/serd/serd.h>

#include <climits>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>

#include <unistd.h>

/**
 * Queries the resident set size (RSS) of the current process via `ps`.
 *
 * Runs `ps -o rss --no-headers <pid>` through the shell with its output redirected
 * to a scratch file in the current working directory, parses the first line of that
 * file as an integer and removes the scratch file again.
 *
 * @return the number printed by `ps` (callers label it as kB), or -1 if the command
 *         failed, the scratch file could not be opened, or its first line is not a
 *         non-negative integer that fits into an int.
 */
inline auto get_rss() {
    const char *tmp_file = "tmp_3124.txt";
    const auto command = std::string{"ps -o rss --no-headers "} + std::to_string(getpid()) + " > " + tmp_file;

    int rss = -1;

    // Only trust the scratch file when the command itself reported success.
    if (std::system(command.c_str()) == 0) {
        // fopen() can fail (e.g. read-only working directory); never hand a null stream to fgets()/fclose().
        if (FILE *file = std::fopen(tmp_file, "r"); file != nullptr) {
            char line[128];
            if (std::fgets(line, sizeof line, file) != nullptr) {
                // strtol() reports malformed input via the end pointer instead of throwing like std::stoi().
                char *end = nullptr;
                const long value = std::strtol(line, &end, 10);
                if (end != line && value >= 0 && value <= INT_MAX) {
                    rss = static_cast<int>(value);
                }
            }
            std::fclose(file);
        }
    }

    // The shell redirection may have created the scratch file even when `ps` failed; do not leave it behind.
    std::remove(tmp_file);
    return rss;
}

/// Base-URI sink passed to serd_reader_new() in main(): ignores its arguments and reports success.
SerdStatus on_base(void * /*handle*/, const SerdNode * /*uri*/) {
    return SERD_SUCCESS;
}

/// Prefix sink passed to serd_reader_new() in main(): ignores its arguments and reports success.
SerdStatus on_prefix(void * /*handle*/, const SerdNode * /*name*/, const SerdNode * /*uri*/) {
    return SERD_SUCCESS;
}

/// Statement sink passed to serd_reader_new() in main(): discards every statement and reports
/// success, so nothing parsed is retained by this program.
SerdStatus on_statement(void * /*handle*/,
                        SerdStatementFlags,
                        const SerdNode *,
                        const SerdNode * /*subject*/,
                        const SerdNode * /*predicate*/,
                        const SerdNode * /*object*/,
                        const SerdNode * /*object_datatype*/,
                        const SerdNode * /*object_lang*/) {
    return SERD_SUCCESS;
}

/// End sink passed to serd_reader_new() in main(): prints a marker line (flushed) and reports success.
SerdStatus on_end(void * /*handle*/, const SerdNode * /*node*/) {
    std::cout << "never reached";
    std::cout << std::endl;
    return SERD_SUCCESS;
}

int main(int argc, char *argv[]) {
    // start serd parser

    std::cout << "file: " << argv[1] << std::endl; // get current file pointer

    auto f = fopen(argv[1], "rb");
    fseek(f, 0, SEEK_END); // seek to end of file
    std::cout << "file size: " << ftell(f)/1024 << " kB" << std::endl; // get current file pointer
    fclose(f);

    SerdReader *reader = serd_reader_new(SERD_TURTLE, (void *) nullptr,
                                         nullptr,
                                         reinterpret_cast<SerdBaseSink>(on_base),
                                         reinterpret_cast<SerdPrefixSink>(on_prefix),
                                         reinterpret_cast<SerdStatementSink>(on_statement),
                                         reinterpret_cast<SerdEndSink>(on_end));
    std::cout << "before reading: " << get_rss() << " kB RSS" << std::endl;



    /* Same result with this.
    f = fopen(argv[1], "rb");
    serd_reader_start_stream(reader, f, reinterpret_cast<const uint8_t *>("serd_debug"), false);
    while (serd_reader_read_chunk(reader) == SERD_SUCCESS);
    serd_reader_end_stream(reader);
    */
    serd_reader_read_file(reader, reinterpret_cast<const uint8_t *>(argv[1]));
    std::cout << "after reading: " << get_rss() << " kB RSS" << std::endl;
    std::string cmd = std::string{"pmap "} + std::to_string(getpid());
    std::cout << cmd << std::endl;
    system(cmd.c_str());
    serd_reader_free(reader);
    std::cout << "after freeing reader: " << get_rss() << " kB RSS" << std::endl;
    return 0;
}

Source files: CMakeLists.txt main.cpp

Edited by Alexander Bigerl