速度最快、内存效率也最高的方法是两遍扫描的方案。第一遍扫描时,计算所有字符串的总大小,然后一次性分配整块内存;第二遍扫描时,把所有字符串读入这块大缓冲区。
您可以为这些字符串创建一个指针数组,并通过相邻指针之差得到每个字符串的长度。这样就可以省去作为结束标记的空字节。
以下是一个完整的示例:
#include <stdio.h>
#include <memory.h>
#include <stdlib.h>
/*
 * Packed collection of strings stored back to back in one character
 * buffer, without NUL terminators. String boundaries are kept in a
 * separate pointer table, so the length of string i is
 * ptr[i+1] - ptr[i].
 *
 * The typedef makes the bare name `StringMap` usable from C as well as
 * C++ (the rest of the file refers to the type without `struct`).
 */
typedef struct StringMap
{
    char *data;  /* start of the single buffer holding all string bytes */
    char **ptr;  /* ptr[i] = start of string i; ptr[i+1] = one past its end */
    long cPos;   /* number of finished strings; ptr[cPos+1] is the write cursor */
} StringMap;
/*
 * Prepare a StringMap able to hold `numberOfStrings` strings totalling
 * `totalCharacters` bytes.
 *
 * Allocates one character buffer (totalCharacters + 1 bytes) and one
 * pointer table (numberOfStrings + 2 entries: one start pointer per
 * string plus the slot used as the write cursor after the last string
 * has been ended). The whole table is zero-filled.
 *
 * On failure (negative sizes or allocation error) nothing is left
 * allocated and both `data` and `ptr` are NULL; callers should check
 * for that before using the map. Release with freeStringMap().
 */
void initStringMap(StringMap *stringMap, long numberOfStrings, long totalCharacters)
{
    stringMap->data = NULL;
    stringMap->ptr = NULL;
    stringMap->cPos = 0;

    if (numberOfStrings < 0 || totalCharacters < 0) {
        return;
    }

    stringMap->data = (char*)malloc((size_t)totalCharacters + 1);
    /* calloc zero-fills every slot and checks the count*size product. */
    stringMap->ptr = (char**)calloc((size_t)numberOfStrings + 2, sizeof(char*));
    if (stringMap->data == NULL || stringMap->ptr == NULL) {
        free(stringMap->data);
        free(stringMap->ptr);
        stringMap->data = NULL;
        stringMap->ptr = NULL;
        return;
    }

    /* String 0 starts at the beginning of the buffer and is still empty. */
    stringMap->ptr[0] = stringMap->data;
    stringMap->ptr[1] = stringMap->data;
}
/*
 * Append `size` bytes from `str` to the string currently being built.
 *
 * The write cursor is the table entry just after the last finished
 * string; the bytes are copied there and the cursor is advanced past
 * them. No capacity check is performed here.
 */
void extendString(StringMap *stringMap, char *str, size_t size)
{
    char **cursor = &stringMap->ptr[stringMap->cPos + 1];

    memcpy(*cursor, str, size);
    *cursor = *cursor + size;
}
/*
 * Finish the string currently being built and open a new, empty one.
 *
 * The finished-string count is incremented, and the next write cursor
 * is set to the end of the string that was just finished.
 */
void endString(StringMap *stringMap)
{
    const long finished = stringMap->cPos + 1;

    stringMap->cPos = finished;
    stringMap->ptr[finished + 1] = stringMap->ptr[finished];
}
/* Return how many strings have been finished (via endString) so far. */
long numberOfStringsInStringMap(StringMap *stringMap)
{
    const long finishedCount = stringMap->cPos;

    return finishedCount;
}
/*
 * Return the length in bytes of the string at `index`, computed as the
 * distance between its start pointer and the next table entry.
 */
size_t stringSizeInStringMap(StringMap *stringMap, long index)
{
    char *const first = stringMap->ptr[index];
    char *const pastLast = stringMap->ptr[index + 1];

    return pastLast - first;
}
/*
 * Return a pointer to the first byte of the string at `index`.
 * The pointer is the raw table entry; use stringSizeInStringMap() for
 * the length.
 */
char* stringinStringMap(StringMap *stringMap, long index)
{
    char **entry = stringMap->ptr + index;

    return *entry;
}
/*
 * Release both allocations owned by the map (character buffer and
 * pointer table).
 *
 * The fields are reset afterwards so the struct holds no dangling
 * pointers and calling this function a second time is harmless.
 */
void freeStringMap(StringMap *stringMap)
{
    free(stringMap->data);
    free(stringMap->ptr);
    stringMap->data = NULL;
    stringMap->ptr = NULL;
    stringMap->cPos = 0;
}
int main()
{
long numberOfStrings = 0;
long totalCharacters = 0;
FILE *fd = fopen("/path/to/large/textfile.txt", "r");
int bufferSize = 4096;
char *readBuffer = (char*)malloc(sizeof(char)*bufferSize);
int currentStringLength = 0;
ssize_t readBytes;
while ((readBytes = fread(readBuffer, sizeof(char), bufferSize, fd))>0) {
for (int i = 0; i < readBytes; ++i) {
const char c = readBuffer[i];
if (c != '\n') {
++currentStringLength;
} else {
++numberOfStrings;
totalCharacters += currentStringLength;
currentStringLength = 0;
}
}
}
printf("Found %ld strings with total of %ld bytes\n", numberOfStrings, totalCharacters);
StringMap stringMap;
initStringMap(&stringMap, numberOfStrings, totalCharacters);
rewind(fd);
while ((readBytes = fread(readBuffer, sizeof(char), bufferSize, fd))>0) {
char *stringStart = readBuffer;
for (int i = 0; i < readBytes; ++i) {
const char c = readBuffer[i];
if (c == '\n') {
extendString(&stringMap, stringStart, &readBuffer[i]-stringStart);
endString(&stringMap);
stringStart = &readBuffer[i+1];
}
}
if (stringStart < &readBuffer[readBytes]) {
extendString(&stringMap, stringStart, &readBuffer[readBytes]-stringStart);
}
}
endString(&stringMap);
fclose(fd);
numberOfStrings = numberOfStringsInStringMap(&stringMap);
printf("Number of strings in map: %ld\n", numberOfStrings);
for (long i = 0; i < numberOfStrings; ++i) {
size_t stringSize = stringSizeInStringMap(&stringMap, i);
char *buffer = (char*)malloc(stringSize+1);
memcpy(buffer, stringinStringMap(&stringMap, i), stringSize);
buffer[stringSize-1] = '\0';
printf("string %05ld size=%8ld : %s\n", i, stringSize, buffer);
free(buffer);
}
freeStringMap(&stringMap);
}
这个例子读取一个非常大的文本文件,将其按行分割,并创建一个每行对应一个字符串的数组。存储这些字符串只需要两次 malloc 调用:一次用于指针数组,另一次用于字符串块。
malloc
。 - RedX