在C语言中读取.csv文件

54

我有一个 .csv 文件:

lp;imie;nazwisko;ulica;numer;kod;miejscowosc;telefon;email;data_ur
1;Jan;Kowalski;ul. Nowa;1a;11-234;Budry;123-123-456;jan@go.xxx;1980.05.13
2;Jerzy;Nowak;ul. Konopnicka;13a/3;00-900;Lichowice;(55)333-44-55;jer@wu.to;1990.03.23

我需要用C语言读取这个。我已经有一些代码,但只是用于连接。


4
C CSV 解析器:http://sourceforge.net/projects/cccsvparser C CSV 写入器:http://sourceforge.net/projects/cccsvwriter - SomethingSomething
3
请[编辑]您的问题,以展示您已有的代码(http://whathaveyoutried.com)。您应该至少包括一个代码概述(最好是[mcve]),说明您遇到的问题,然后我们可以尝试解决具体问题。您还应该阅读[ask]。 - Toby Speight
快速且有大量示例:https://github.com/liquidaty/zsv - mwag
示例输入数据由分号分隔。CSV - “CSV”一词还表示几种密切相关的分隔符分隔格式,这些格式使用其他字段分隔符,例如分号。 - Peter Mortensen
6个回答

73

希望这可以让您有所启发

http://ideone.com/l23He上实时查看它(使用标准输入)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

const char* getfield(char* line, int num)
{
    const char* tok;
    for (tok = strtok(line, ";");
            tok && *tok;
            tok = strtok(NULL, ";\n"))
    {
        if (!--num)
            return tok;
    }
    return NULL;
}

int main()
{
    FILE* stream = fopen("input", "r");

    char line[1024];
    while (fgets(line, 1024, stream))
    {
        char* tmp = strdup(line);
        printf("Field 3 would be %s\n", getfield(tmp, 3));
        // NOTE strtok clobbers tmp
        free(tmp);
    }
}

输出:

Field 3 would be nazwisko
Field 3 would be Kowalski
Field 3 would be Nowak

10
由于strtok无法处理空节点,如何处理类似于“"A1,B2,C3,,F5,G6"这样的输入行? 我正在使用strchrstrcpy`的组合,但无法获取'G6'的值。 - ProfessionalAmateur
2
@ProfessionalAmateur 我会使用C++。请参考c++ answers about csv - sehe
3
我们称它们为“tokens”,而不是“nodes”。 - Lightness Races in Orbit
1
“Tokens” 是正确的术语,花太多时间在 XML 上了。 - ProfessionalAmateur
2
这里有一个我使用的简单函数。请查看 zstrtok() 函数。https://github.com/fnoyanisi/zString - fnisi
显示剩余13条评论

11
以下代码是用纯C语言编写的,可以处理空格。 它只分配一次内存,因此每个处理过的行只需要一个free()函数。 http://ideone.com/mSCgPM
/* Tiny CSV Reader */
/* Copyright (C) 2015, Deligiannidis Konstantinos

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://w...content-available-to-author-only...u.org/licenses/>.  */


#include <stdio.h>
#include <string.h>
#include <stdlib.h>


/* For more that 100 columns or lines (when delimiter = \n), minor modifications are needed. */
int getcols( const char * const line, const char * const delim, char ***out_storage )

{
const char *start_ptr, *end_ptr, *iter;
char **out;
int i;                                          //For "for" loops in the old c style.
int tokens_found = 1, delim_size, line_size;    //Calculate "line_size" indirectly, without strlen() call.
int start_idx[100], end_idx[100];   //Store the indexes of tokens. Example "Power;": loc('P')=1, loc(';')=6
//Change 100 with MAX_TOKENS or use malloc() for more than 100 tokens. Example: "b1;b2;b3;...;b200"

if ( *out_storage != NULL )                 return -4;  //This SHOULD be NULL: Not Already Allocated
if ( !line || !delim )                      return -1;  //NULL pointers Rejected Here
if ( (delim_size = strlen( delim )) == 0 )  return -2;  //Delimiter not provided

start_ptr = line;   //Start visiting input. We will distinguish tokens in a single pass, for good performance.
                    //Then we are allocating one unified memory region & doing one memory copy.
while ( ( end_ptr = strstr( start_ptr, delim ) ) ) {

    start_idx[ tokens_found -1 ] = start_ptr - line;    //Store the Index of current token
    end_idx[ tokens_found - 1 ] = end_ptr - line;       //Store Index of first character that will be replaced with
                                                        //'\0'. Example: "arg1||arg2||end" -> "arg1\0|arg2\0|end"
    tokens_found++;                                     //Accumulate the count of tokens.
    start_ptr = end_ptr + delim_size;                   //Set pointer to the next c-string within the line
}

for ( iter = start_ptr; (*iter!='\0') ; iter++ );

start_idx[ tokens_found -1 ] = start_ptr - line;    //Store the Index of current token: of last token here.
end_idx[ tokens_found -1 ] = iter - line;           //and the last element that will be replaced with \0

line_size = iter - line;    //Saving CPU cycles: Indirectly Count the size of *line without using strlen();

int size_ptr_region = (1 + tokens_found)*sizeof( char* );   //The size to store pointers to c-strings + 1 (*NULL).
out = (char**) malloc( size_ptr_region + ( line_size + 1 ) + 5 );   //Fit everything there...it is all memory.
//It reserves a contiguous space for both (char**) pointers AND string region. 5 Bytes for "Out of Range" tests.
*out_storage = out;     //Update the char** pointer of the caller function.

//"Out of Range" TEST. Verify that the extra reserved characters will not be changed. Assign Some Values.
//char *extra_chars = (char*) out + size_ptr_region + ( line_size + 1 );
//extra_chars[0] = 1; extra_chars[1] = 2; extra_chars[2] = 3; extra_chars[3] = 4; extra_chars[4] = 5;

for ( i = 0; i < tokens_found; i++ )    //Assign adresses first part of the allocated memory pointers that point to
    out[ i ] = (char*) out + size_ptr_region + start_idx[ i ];  //the second part of the memory, reserved for Data.
out[ tokens_found ] = (char*) NULL; //[ ptr1, ptr2, ... , ptrN, (char*) NULL, ... ]: We just added the (char*) NULL.
                                                    //Now assign the Data: c-strings. (\0 terminated strings):
char *str_region = (char*) out + size_ptr_region;   //Region inside allocated memory which contains the String Data.
memcpy( str_region, line, line_size );   //Copy input with delimiter characters: They will be replaced with \0.

//Now we should replace: "arg1||arg2||arg3" with "arg1\0|arg2\0|arg3". Don't worry for characters after '\0'
//They are not used in standard c lbraries.
for( i = 0; i < tokens_found; i++) str_region[ end_idx[ i ] ] = '\0';

//"Out of Range" TEST. Wait until Assigned Values are Printed back.
//for ( int i=0; i < 5; i++ ) printf("c=%x ", extra_chars[i] ); printf("\n");

// *out memory should now contain (example data):
//[ ptr1, ptr2,...,ptrN, (char*) NULL, "token1\0", "token2\0",...,"tokenN\0", 5 bytes for tests ]
//   |__________________________________^           ^              ^             ^
//          |_______________________________________|              |             |
//                   |_____________________________________________|      These 5 Bytes should be intact.

return tokens_found;
}


int main()

{

char in_line[] = "Arg1;;Th;s is not Del;m;ter;;Arg3;;;;Final";
char delim[] = ";;";
char **columns;
int i;

printf("Example1:\n");
columns = NULL; //Should be NULL to indicate that it is not assigned to allocated memory. Otherwise return -4;

int cols_found = getcols( in_line, delim, &columns);
for ( i = 0; i < cols_found; i++ ) printf("Column[ %d ] = %s\n", i, columns[ i ] );  //<- (1st way).
// (2nd way) // for ( i = 0; columns[ i ]; i++) printf("start_idx[ %d ] = %s\n", i, columns[ i ] );

free( columns );    //Release the Single Contiguous Memory Space.
columns = NULL;     //Pointer = NULL to indicate it does not reserve space and that is ready for the next malloc().

printf("\n\nExample2, Nested:\n\n");

char example_file[] = "ID;Day;Month;Year;Telephone;email;Date of registration\n"
        "1;Sunday;january;2009;123-124-456;jitter@go.xyz;2015-05-13\n"
        "2;Monday;March;2011;(+30)333-22-55;buffer@wl.it;2009-05-23";

char **rows;
int j;

rows = NULL; //getcols() requires it to be NULL. (Avoid dangling pointers, leaks e.t.c).

getcols( example_file, "\n", &rows);
for ( i = 0; rows[ i ]; i++) {
    {
        printf("Line[ %d ] = %s\n", i, rows[ i ] );
        char **columnX = NULL;
        getcols( rows[ i ], ";", &columnX);
        for ( j = 0; columnX[ j ]; j++) printf("  Col[ %d ] = %s\n", j, columnX[ j ] );
        free( columnX );
    }
}

free( rows );
rows = NULL;

return 0;
}

4
使用fscanf读取文件,直到遇到';\n为止,然后使用fscanf(f, "%*c")跳过它。
int main()
{
    char str[128];
    int result;
    FILE* f = fopen("test.txt", "r");

    /*...*/

    do {
        result = fscanf(f, "%127[^;\n]", str);

        if(result == 0)
        {
            result = fscanf(f, "%*c");
        }
        else
        {
            //Put here whatever you want to do with your value.
            printf("%s\n", str);
        }

    } while(result != EOF);

    return 0;
}

3

以下是一个完整的示例,该示例将原始输入缓冲区中的字段保留为以NULL结尾的字符串,并通过char指针数组提供对其的访问。CSV处理器已经确认可用于包含“双引号”的字段,忽略其中的任何分隔符字符。

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// adjust BUFFER_SIZE to suit longest line 
#define BUFFER_SIZE 1024 * 1024
#define NUM_FIELDS 10
#define MAXERRS 5
#define RET_OK 0
#define RET_FAIL 1
#define FALSE 0
#define TRUE 1

// char* array will point to fields
char *pFields[NUM_FIELDS];
// field offsets into pFields array:
#define LP          0
#define IMIE        1
#define NAZWISKo    2
#define ULICA       3
#define NUMER       4
#define KOD         5
#define MIEJSCOw    6
#define TELEFON     7
#define EMAIL       8
#define DATA_UR     9

long loadFile(FILE *pFile, long *errcount);
static int  loadValues(char *line, long lineno);
static char delim;

long loadFile(FILE *pFile, long *errcount){

    char sInputBuf [BUFFER_SIZE];
    long lineno = 0L;

    if(pFile == NULL)
        return RET_FAIL;

    while (!feof(pFile)) {

        // load line into static buffer
        if(fgets(sInputBuf, BUFFER_SIZE-1, pFile)==NULL)
            break;

        // skip first line (headers)
        if(++lineno==1)
            continue;

        // jump over empty lines
        if(strlen(sInputBuf)==0)
            continue;
        // set pFields array pointers to null-terminated string fields in sInputBuf
        if(loadValues(sInputBuf,lineno)==RET_FAIL){
           (*errcount)++;
            if(*errcount > MAXERRS)
                break;
        } else {    
            // On return pFields array pointers point to loaded fields ready for load into DB or whatever
            // Fields can be accessed via pFields, e.g.
            printf("lp=%s, imie=%s, data_ur=%s\n", pFields[LP], pFields[IMIE], pFields[DATA_UR]);
        }
    }
    return lineno;
}


static int  loadValues(char *line, long lineno){
    if(line == NULL)
        return RET_FAIL;

    // chop of last char of input if it is a CR or LF (e.g.Windows file loading in Unix env.)
    // can be removed if sure fgets has removed both CR and LF from end of line
    if(*(line + strlen(line)-1) == '\r' || *(line + strlen(line)-1) == '\n')
        *(line + strlen(line)-1) = '\0';
    if(*(line + strlen(line)-1) == '\r' || *(line + strlen(line)-1 )== '\n')
        *(line + strlen(line)-1) = '\0';

    char *cptr = line;
    int fld = 0;
    int inquote = FALSE;
    char ch;

    pFields[fld]=cptr;
    while((ch=*cptr) != '\0' && fld < NUM_FIELDS){
        if(ch == '"') {
            if(! inquote)
                pFields[fld]=cptr+1;
            else {
                *cptr = '\0';               // zero out " and jump over it
            }
            inquote = ! inquote;
        } else if(ch == delim && ! inquote){
            *cptr = '\0';                   // end of field, null terminate it
            pFields[++fld]=cptr+1;
        }
        cptr++;
    }   
    if(fld > NUM_FIELDS-1){
        fprintf(stderr, "Expected field count (%d) exceeded on line %ld\n", NUM_FIELDS, lineno);
        return RET_FAIL;
    } else if (fld < NUM_FIELDS-1){
        fprintf(stderr, "Expected field count (%d) not reached on line %ld\n", NUM_FIELDS, lineno);
        return RET_FAIL;    
    }
    return RET_OK;
}

int main(int argc, char **argv)
{
   FILE *fp;
   long errcount = 0L;
   long lines = 0L;

   if(argc!=3){
       printf("Usage: %s csvfilepath delimiter\n", basename(argv[0]));
       return (RET_FAIL);
   }   
   if((delim=argv[2][0])=='\0'){
       fprintf(stderr,"delimiter must be specified\n");
       return (RET_FAIL);
   }
   fp = fopen(argv[1] , "r");
   if(fp == NULL) {
      fprintf(stderr,"Error opening file: %d\n",errno);
      return(RET_FAIL);
   }
   lines=loadFile(fp,&errcount);
   fclose(fp);
   printf("Processed %ld lines, encountered %ld error(s)\n", lines, errcount);
   if(errcount>0)
        return(RET_FAIL);
    return(RET_OK); 
}

2
被接受的答案将把这个由4个元素组成的CSV视为6个元素:QA-Q000630115728222,QA-A0926511569122067,"In 1687 John Phillips, Miltons nephew, produced a Don Quixote made English.",2017-03-07T00:00:00.000Z Gus Gator的例子将把它视为正确的4个元素。 - Adam Howell

2
这段代码相当简单,但很有效。它可以解析带括号的逗号分隔文件。你可以轻松修改它以适应你的需求。
#include <stdio.h>
#include <stdlib.h>
#include <string.h>


int main(int argc, char *argv[])
{
  // argv[1] path to csv file
  // argv[2] number of lines to skip
  // argv[3] length of longest value (in characters)

  FILE *pfinput;
  unsigned int nSkipLines, currentLine, lenLongestValue;
  char *pTempValHolder;
  int c;
  unsigned int vcpm; // Value character marker
  int QuotationOnOff; // 0 - off, 1 - on

  nSkipLines = atoi(argv[2]);
  lenLongestValue = atoi(argv[3]);

  pTempValHolder = (char*)malloc(lenLongestValue);

  if(pfinput = fopen(argv[1], "r")) {

    rewind(pfinput);

    currentLine = 1;
    vcpm = 0;
    QuotationOnOff = 0;

    // currentLine > nSkipLines condition
    // skips / ignores first argv[2] lines
    while((c = fgetc(pfinput)) != EOF)
    {
       switch(c)
       {
          case ',':
            if(!QuotationOnOff && currentLine > nSkipLines)
            {
              pTempValHolder[vcpm] = '\0';
              printf("%s,", pTempValHolder);
              vcpm = 0;
            }
            break;

          case '\n':
            if(currentLine > nSkipLines)
            {
              pTempValHolder[vcpm] = '\0';
              printf("%s\n", pTempValHolder);
              vcpm = 0;
            }
            currentLine++;
            break;

          case '\"':
            if(currentLine > nSkipLines)
            {
              if(!QuotationOnOff) {
                QuotationOnOff = 1;
                pTempValHolder[vcpm] = c;
                vcpm++;
              } else {
                QuotationOnOff = 0;
                pTempValHolder[vcpm] = c;
                vcpm++;
              }
            }
            break;

          default:
            if(currentLine > nSkipLines)
            {
              pTempValHolder[vcpm] = c;
              vcpm++;
            }
            break;
       }
    }

    fclose(pfinput);
    free(pTempValHolder);
  }

  return 0;
}

2
我想问一下发帖人所说的“带括号”是什么意思,但他已经不是SO的成员了。这段代码非常干净,但据我所知,它是从第N行复制文件末尾的所有内容到输出的复杂方式。 - Jonathan Leffler

0
#include <conio.h>
#include <stdio.h>
#include <string.h>

// Driver Code
int main()
{
    // Substitute the full file path
    // for the string file_path
    FILE* fp = fopen("Movie.csv", "r");
    char *wrds[40];
    if (!fp)
        printf("Can't open file\n");

    else {
        // Here we have taken size of
        // array 1024 you can modify it
        char buffer[1024];

        int row = 0;
        int column = 0;

        while (fgets(buffer, 1024, fp)) {
            column = 0;
            row++;

            // To avoid printing of column
            // names in file can be changed
            // according to need
            if (row == 1)
                continue;

            // Splitting the data
            char* value = strtok(buffer, ", ");

            while (value) {
                // Column 1
                if (column == 0) {
                    printf("Name :");
                }

                // Column 2
                if (column == 1) {
                   printf("\tAccount No. :");
                }

                // Column 3
                if (column == 2) {
                    printf("\tAmount :");
                }

                printf("%s", value);
                wrds[column] = value;
                value = strtok(NULL, ", ");
                column++;
            }

            printf("\n");
        }

        // Close the file
        fclose(fp);
   }
    getchar();
    return 0;
}

网页内容由stack overflow 提供, 点击上面的
可以查看英文原文,
原文链接