我需要递归列出C编程中的所有目录和文件。我已研究了FTW,但它不包含在我正在使用的两个操作系统(Fedora和Minix)中。我已经读了几个小时了,开始感到非常头痛。
如果有人知道我可以查看的代码片段,那太棒了,或者如果有人可以给我良好的指导,我将非常感激。
我需要递归列出C编程中的所有目录和文件。我已研究了FTW,但它不包含在我正在使用的两个操作系统(Fedora和Minix)中。我已经读了几个小时了,开始感到非常头痛。
如果有人知道我可以查看的代码片段,那太棒了,或者如果有人可以给我良好的指导,我将非常感激。
为什么每个人都要一遍又一遍地重新发明轮子呢?
POSIX.1-2008标准化了nftw()
函数,这个函数也在Single Unix规范v4(SuSv4)中定义,并且可用于Linux(glibc,man 3 nftw
),OS X和大多数当前的BSD变种。它并不是新功能。
天真的基于 opendir()
/ readdir()
/ closedir()
的实现几乎从不处理目录或文件在树遍历期间移动,重命名或删除的情况,而nftw()
应该能够优雅地处理它们。
例如,考虑以下C程序,它列出以当前工作目录开头的目录树,或者列出命令行上命名的每个目录的目录树,或只列出命令行上命名的文件:
/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */
#define _XOPEN_SOURCE 700
/* Added on 2017-06-25:
If the C library can support 64-bit file sizes
and offsets, using the standard names,
these defines tell the C library to do so. */
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64
#include <stdlib.h>
#include <unistd.h>
#include <ftw.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
/* POSIX.1 says each process has at least 20 file descriptors.
* Three of those belong to the standard streams.
* Here, we use a conservative estimate of 15 available;
* assuming we use at most two for other uses in this program,
* we should never run into any problems.
* Most trees are shallower than that, so it is efficient.
* Deeper trees are traversed fine, just a bit slower.
* (Linux allows typically hundreds to thousands of open files,
* so you'll probably never see any issues even if you used
* a much higher value, say a couple of hundred, but
* 15 is a safe, reasonable value.)
*/
#ifndef USE_FDS
#define USE_FDS 15
#endif
int print_entry(const char *filepath, const struct stat *info,
const int typeflag, struct FTW *pathinfo)
{
/* const char *const filename = filepath + pathinfo->base; */
const double bytes = (double)info->st_size; /* Not exact if large! */
struct tm mtime;
localtime_r(&(info->st_mtime), &mtime);
printf("%04d-%02d-%02d %02d:%02d:%02d",
mtime.tm_year+1900, mtime.tm_mon+1, mtime.tm_mday,
mtime.tm_hour, mtime.tm_min, mtime.tm_sec);
if (bytes >= 1099511627776.0)
printf(" %9.3f TiB", bytes / 1099511627776.0);
else
if (bytes >= 1073741824.0)
printf(" %9.3f GiB", bytes / 1073741824.0);
else
if (bytes >= 1048576.0)
printf(" %9.3f MiB", bytes / 1048576.0);
else
if (bytes >= 1024.0)
printf(" %9.3f KiB", bytes / 1024.0);
else
printf(" %9.0f B ", bytes);
if (typeflag == FTW_SL) {
char *target;
size_t maxlen = 1023;
ssize_t len;
while (1) {
target = malloc(maxlen + 1);
if (target == NULL)
return ENOMEM;
len = readlink(filepath, target, maxlen);
if (len == (ssize_t)-1) {
const int saved_errno = errno;
free(target);
return saved_errno;
}
if (len >= (ssize_t)maxlen) {
free(target);
maxlen += 1024;
continue;
}
target[len] = '\0';
break;
}
printf(" %s -> %s\n", filepath, target);
free(target);
} else
if (typeflag == FTW_SLN)
printf(" %s (dangling symlink)\n", filepath);
else
if (typeflag == FTW_F)
printf(" %s\n", filepath);
else
if (typeflag == FTW_D || typeflag == FTW_DP)
printf(" %s/\n", filepath);
else
if (typeflag == FTW_DNR)
printf(" %s/ (unreadable)\n", filepath);
else
printf(" %s (unknown)\n", filepath);
return 0;
}
int print_directory_tree(const char *const dirpath)
{
int result;
/* Invalid directory path? */
if (dirpath == NULL || *dirpath == '\0')
return errno = EINVAL;
result = nftw(dirpath, print_entry, USE_FDS, FTW_PHYS);
if (result >= 0)
errno = result;
return errno;
}
int main(int argc, char *argv[])
{
int arg;
if (argc < 2) {
if (print_directory_tree(".")) {
fprintf(stderr, "%s.\n", strerror(errno));
return EXIT_FAILURE;
}
} else {
for (arg = 1; arg < argc; arg++) {
if (print_directory_tree(argv[arg])) {
fprintf(stderr, "%s.\n", strerror(errno));
return EXIT_FAILURE;
}
}
}
return EXIT_SUCCESS;
}
上述大部分代码都在print_entry()
中。它的任务是打印出每个目录条目。在print_directory_tree()
中,我们告诉nftw()
对于它看到的每个目录条目都调用它。
上面唯一模糊的细节是决定让nftw()
使用多少个文件描述符。如果您的程序在文件树遍历期间最多使用两个额外的文件描述符(除标准流之外),则已知15个是安全的(在所有具有nftw()
和基本符合POSIX的系统上)。
在Linux中,您可以使用sysconf(_SC_OPEN_MAX)
查找最大打开文件数,并减去您与nftw()
调用并发使用的数量,但我不会费心(除非我知道该实用程序将主要用于病态的深层目录结构)。十五个描述符并不限制树深度; nftw()
只是变慢了(如果遍历一个距离该目录13个目录更深的目录,则可能无法检测到目录中的更改,尽管在系统之间以及C库实现之间的权衡和一般能力来检测更改各不相同)。只使用这样的编译时常量可使代码具有可移植性 - 它不仅应该在Linux上工作,而且应该在Mac OS X和所有当前的BSD变体以及大多数其他不太旧的Unix变体上都能正常工作。
在评论中,Ruslan提到他们不得不切换到nftw64()
,因为他们有需要64位大小/偏移的文件系统条目,并且“普通”版本的nftw()
失败并返回errno == EOVERFLOW
。正确的解决方案不是切换到GLIBC特定的64位函数,而是定义_LARGEFILE64_SOURCE
和_FILE_OFFSET_BITS 64
。这些告诉C库在可能的情况下切换到64位文件大小和偏移,同时使用标准函数(nftw()
,fstat()
等)和类型名称(off_t
等)。
opendir()
、readdir()
、closedir()
的练习,并旨在教导人们仔细思考完整路径名与目录条目之间的区别。在生产代码中,nftw()
是正确的选择——别误会我的意思。但是使用原始函数进行练习也有教育目的。 - Jonathan Leffleropendir()
/readdir()
/closedir()
的典型示例不安全且容易产生竞争,如果在遍历过程中移动涉及到的目录,则可能会做出非常奇怪的事情。这是不好的代码,易碎的。 nftw()
等函数应该能够正确处理所有这些问题。我们可以使用 POSIX.1-2008 的 openat()
/fdopendir()
/readdir()
/closedir()
编写一个安全的目录树遍历器,但我们可以使用的文件描述符数量将受到限制,限制了树的深度。 nftw()
即使在这种情况下也可以安全地避免这种限制。 - Nominal Animal*dir
函数的替代方案。非常感谢。 - Daniel Kamil Kozar_LARGEFILE64_SOURCE
;而应该使用 _FILE_OFFSET_BITS 64
”。不幸的是,似乎仍有许多人在使用旧版本的glibc(2.2.x、2.3.x),这些版本需要那个遗留宏来选择正确的接口。由于它不会对较新的C库造成任何损害(并且由于向后兼容性要求,在未来也不会造成任何损害),我也选择将其包含在内。如果你知道有任何矛盾之处,请让我知道! - Nominal Animal这里是一个递归版本:
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>
void listdir(const char *name, int indent)
{
DIR *dir;
struct dirent *entry;
if (!(dir = opendir(name)))
return;
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_DIR) {
char path[1024];
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
continue;
snprintf(path, sizeof(path), "%s/%s", name, entry->d_name);
printf("%*s[%s]\n", indent, "", entry->d_name);
listdir(path, indent + 2);
} else {
printf("%*s- %s\n", indent, "", entry->d_name);
}
}
closedir(dir);
}
int main(void) {
listdir(".", 0);
return 0;
}
int is_directory_we_want_to_list(const char *parent, char *name) {
struct stat st_buf;
if (!strcmp(".", name) || !strcmp("..", name))
return 0;
char *path = alloca(strlen(name) + strlen(parent) + 2);
sprintf(path, "%s/%s", parent, name);
stat(path, &st_buf);
return S_ISDIR(st_buf.st_mode);
}
int list(const char *name) {
DIR *dir = opendir(name);
struct dirent *ent;
while (ent = readdir(dir)) {
char *entry_name = ent->d_name;
printf("%s\n", entry_name);
if (is_directory_we_want_to_list(name, entry_name)) {
// You can consider using alloca instead.
char *next = malloc(strlen(name) + strlen(entry_name) + 2);
sprintf(next, "%s/%s", name, entry_name);
list(next);
free(next);
}
}
closedir(dir);
}
ftw
提供的。list(entry_name)
? - Jon Egelandis_directory_we_want_to_list
。 - Jan#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
int main(int argc, char const *argv[]) {
/* print use instruction unless a folder name was given */
if (argc < 2)
fprintf(stderr,
"\nuse:\n"
" %s <directory>\n"
"for example:\n"
" %s ./\n\n",
argv[0], argv[0]),
exit(0);
/*************** a small linked list macro implementation ***************/
typedef struct list_s {
struct list_s *next;
struct list_s *prev;
} list_s;
#define LIST_INIT(name) \
{ .next = &name, .prev = &name }
#define LIST_PUSH(dest, node) \
do { \
(node)->next = (dest)->next; \
(node)->prev = (dest); \
(node)->next->prev = (node); \
(dest)->next = (node); \
} while (0);
#define LIST_POP(list, var) \
if ((list)->next == (list)) { \
var = NULL; \
} else { \
var = (list)->next; \
(list)->next = var->next; \
var->next->prev = var->prev; \
}
/*************** a record (file / folder) item type ***************/
typedef struct record_s {
/* this is a flat processing queue. */
list_s queue;
/* this will list all queued and processed folders (cyclic protection) */
list_s folders;
/* this will list all the completed items (siblings and such) */
list_s list;
/* unique ID */
ino_t ino;
/* name length */
size_t len;
/* name string */
char name[];
} record_s;
/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name) \
((record_s *)(((uintptr_t)(node)) - \
((uintptr_t) & ((record_s *)0)->list_name)))
/* initializes a new record */
#define RECORD_INIT(name) \
(record_s){.queue = LIST_INIT((name).queue), \
.folders = LIST_INIT((name).folders), \
.list = LIST_INIT((name).list)}
/*************** the actual code ***************/
record_s records = RECORD_INIT(records);
record_s *pos, *item;
list_s *tmp;
DIR *dir;
struct dirent *entry;
/* initialize the root folder record and add it to the queue */
pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
*pos = RECORD_INIT(*pos);
pos->len = strlen(argv[1]);
memcpy(pos->name, argv[1], pos->len);
if (pos->name[pos->len - 1] != '/')
pos->name[pos->len++] = '/';
pos->name[pos->len] = 0;
/* push to queue, but also push to list (first item processed) */
LIST_PUSH(&records.queue, &pos->queue);
LIST_PUSH(&records.list, &pos->list);
/* as long as the queue has items to be processed, do so */
while (records.queue.next != &records.queue) {
/* pop queued item */
LIST_POP(&records.queue, tmp);
/* collect record to process */
pos = NODE2RECORD(tmp, queue);
/* add record to the processed folder list */
LIST_PUSH(&records.folders, &pos->folders);
/* process the folder and add all folder data to current list */
dir = opendir(pos->name);
if (!dir)
continue;
while ((entry = readdir(dir)) != NULL) {
/* create new item, copying it's path data and unique ID */
item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
*item = RECORD_INIT(*item);
item->len = pos->len + entry->d_namlen;
memcpy(item->name, pos->name, pos->len);
memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
item->name[item->len] = 0;
item->ino = entry->d_ino;
/* add item to the list, right after the `pos` item */
LIST_PUSH(&pos->list, &item->list);
/* unless it's a folder, we're done. */
if (entry->d_type != DT_DIR)
continue;
/* test for '.' and '..' */
if (entry->d_name[0] == '.' &&
(entry->d_name[1] == 0 ||
(entry->d_name[1] == '.' && entry->d_name[2] == 0)))
continue;
/* add folder marker */
item->name[item->len++] = '/';
item->name[item->len] = 0;
/* test for cyclic processing */
list_s *t = records.folders.next;
while (t != &records.folders) {
if (NODE2RECORD(t, folders)->ino == item->ino) {
/* we already processed this folder! */
break; /* this breaks from the small loop... */
}
t = t->next;
}
if (t != &records.folders)
continue; /* if we broke from the small loop, entry is done */
/* item is a new folder, add to queue */
LIST_PUSH(&records.queue, &item->queue);
}
closedir(dir);
}
/*************** Printing the results and cleaning up ***************/
while (records.list.next != &records.list) {
/* pop list item */
LIST_POP(&records.list, tmp);
/* collect and process record */
pos = NODE2RECORD(tmp, list);
fwrite(pos->name, pos->len, 1, stderr);
fwrite("\n", 1, 1, stderr);
/* free node */
free(pos);
}
return 0;
}
编辑
@Stargateur在评论中提到,递归代码在达到打开文件限制之前可能会导致堆栈溢出。
虽然我不知道堆栈溢出有什么好处,但只要进程在调用时不接近文件限制,这种评估可能是正确的。
@Stargateur在评论中提到的另一点是,递归代码的深度受子目录的最大数量限制(ext4文件系统上为64000),并且硬链接极不可能存在(因为Linux / Unix上不允许将硬链接链接到文件夹)。
如果代码在Linux上运行(根据问题,它确实是),那么这是个好消息,因此这个问题并不是真正的问题(除非在macOS或Windows上运行代码)...尽管递归中的64K子文件夹可能会使堆栈崩溃。
话虽如此,非递归选项仍然具有优势,例如能够轻松添加处理的项目数量限制以及能够缓存结果。
P.S.
根据评论,这是一个非递归版本的代码,它不检查循环层次结构。它更快,并且应该足够安全,可以在不允许将硬链接链接到文件夹的Linux机器上使用。
#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
int main(int argc, char const *argv[]) {
/* print use instruction unless a folder name was given */
if (argc < 2)
fprintf(stderr,
"\nuse:\n"
" %s <directory>\n"
"for example:\n"
" %s ./\n\n",
argv[0], argv[0]),
exit(0);
/*************** a small linked list macro implementation ***************/
typedef struct list_s {
struct list_s *next;
struct list_s *prev;
} list_s;
#define LIST_INIT(name) \
{ .next = &name, .prev = &name }
#define LIST_PUSH(dest, node) \
do { \
(node)->next = (dest)->next; \
(node)->prev = (dest); \
(node)->next->prev = (node); \
(dest)->next = (node); \
} while (0);
#define LIST_POP(list, var) \
if ((list)->next == (list)) { \
var = NULL; \
} else { \
var = (list)->next; \
(list)->next = var->next; \
var->next->prev = var->prev; \
}
/*************** a record (file / folder) item type ***************/
typedef struct record_s {
/* this is a flat processing queue. */
list_s queue;
/* this will list all the completed items (siblings and such) */
list_s list;
/* unique ID */
ino_t ino;
/* name length */
size_t len;
/* name string */
char name[];
} record_s;
/* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name) \
((record_s *)(((uintptr_t)(node)) - \
((uintptr_t) & ((record_s *)0)->list_name)))
/* initializes a new record */
#define RECORD_INIT(name) \
(record_s){.queue = LIST_INIT((name).queue), .list = LIST_INIT((name).list)}
/*************** the actual code ***************/
record_s records = RECORD_INIT(records);
record_s *pos, *item;
list_s *tmp;
DIR *dir;
struct dirent *entry;
/* initialize the root folder record and add it to the queue */
pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
*pos = RECORD_INIT(*pos);
pos->len = strlen(argv[1]);
memcpy(pos->name, argv[1], pos->len);
if (pos->name[pos->len - 1] != '/')
pos->name[pos->len++] = '/';
pos->name[pos->len] = 0;
/* push to queue, but also push to list (first item processed) */
LIST_PUSH(&records.queue, &pos->queue);
LIST_PUSH(&records.list, &pos->list);
/* as long as the queue has items to be processed, do so */
while (records.queue.next != &records.queue) {
/* pop queued item */
LIST_POP(&records.queue, tmp);
/* collect record to process */
pos = NODE2RECORD(tmp, queue);
/* process the folder and add all folder data to current list */
dir = opendir(pos->name);
if (!dir)
continue;
while ((entry = readdir(dir)) != NULL) {
/* create new item, copying it's path data and unique ID */
item = malloc(sizeof(*item) + pos->len + entry->d_namlen + 2);
*item = RECORD_INIT(*item);
item->len = pos->len + entry->d_namlen;
memcpy(item->name, pos->name, pos->len);
memcpy(item->name + pos->len, entry->d_name, entry->d_namlen);
item->name[item->len] = 0;
item->ino = entry->d_ino;
/* add item to the list, right after the `pos` item */
LIST_PUSH(&pos->list, &item->list);
/* unless it's a folder, we're done. */
if (entry->d_type != DT_DIR)
continue;
/* test for '.' and '..' */
if (entry->d_name[0] == '.' &&
(entry->d_name[1] == 0 ||
(entry->d_name[1] == '.' && entry->d_name[2] == 0)))
continue;
/* add folder marker */
item->name[item->len++] = '/';
item->name[item->len] = 0;
/* item is a new folder, add to queue */
LIST_PUSH(&records.queue, &item->queue);
}
closedir(dir);
}
/*************** Printing the results and cleaning up ***************/
while (records.list.next != &records.list) {
/* pop list item */
LIST_POP(&records.list, tmp);
/* collect and process record */
pos = NODE2RECORD(tmp, list);
fwrite(pos->name, pos->len, 1, stderr);
fwrite("\n", 1, 1, stderr);
/* free node */
free(pos);
}
return 0;
}
下面是一个简化版的递归函数,它使用的堆栈空间更少:
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
void listdir(char *path, size_t size) {
DIR *dir;
struct dirent *entry;
size_t len = strlen(path);
if (!(dir = opendir(path))) {
fprintf(stderr, "path not found: %s: %s\n",
path, strerror(errno));
return;
}
puts(path);
while ((entry = readdir(dir)) != NULL) {
char *name = entry->d_name;
if (entry->d_type == DT_DIR) {
if (!strcmp(name, ".") || !strcmp(name, ".."))
continue;
if (len + strlen(name) + 2 > size) {
fprintf(stderr, "path too long: %s/%s\n", path, name);
} else {
path[len] = '/';
strcpy(path + len + 1, name);
listdir(path, size);
path[len] = '\0';
}
} else {
printf("%s/%s\n", path, name);
}
}
closedir(dir);
}
int main(void) {
char path[1024] = ".";
listdir(path, sizeof path);
return 0;
}
find .
完全相同。不构建路径名遍历目录树
这是使用文件描述符引用目录的版本,使用 fdopendir()
、fstatat()
和 openat()
遍历目录树而无需构建任何路径名。
这种方法实现更简单,并且在具有深度嵌套目录树的系统上非常有用,其中完整路径名可能超过 PATH_MAX
- 请注意,PATH_MAX
可能不存在。
#define _POSIX_C_SOURCE 200809L
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
实际目录树遍历实现:
// the actual walking is done by descriptor, not name
static int myftwImp( int dirfd )
{
DIR *dirp = fdopendir( dirfd );
for ( ;; )
{
struct dirent *dent = readdir( dirp );
if ( NULL == dent ) break;
if ( ( 0 == strcmp( ".", dent->d_name ) ) ||
( 0 == strcmp( "..", dent->d_name ) ) )
{
continue;
}
struct stat sb = { 0 };
fstatat( dirfd, dent->d_name, &sb, 0 );
if ( S_ISDIR( sb.st_mode ) )
{
printf( "dir: %s\n", dent->d_name );
int newdirfd = openat( dirfd, dent->d_name,
O_RDONLY | O_DIRECTORY );
myftwImp( newdirfd );
}
printf( " file: %s\n", dent->d_name );
}
// this will close the descriptor, too
closedir( dirp );
return( 0 );
}
使用目录名称的公共调用:
int myftw( const char *dirname )
{
int dirfd = open( dirname, O_RDONLY | O_DIRECTORY );
myftwImp( dirfd );
return( 0 );
}
使用示例:
int main( int argc, char **argv )
{
int rc = myftw( argv[ 1 ] );
return( rc );
}
为简洁起见,在此没有进行任何错误检查。实际代码应检查所有调用的错误并适当处理。
带有错误检查的完整代码:
#define _POSIX_C_SOURCE 200809L
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
static int myftwImp( int dirfd )
{
DIR *dirp = fdopendir( dirfd );
if ( NULL == dirp )
{
return( -1 );
}
int rc = 0;
for ( ;; )
{
struct dirent *dent = readdir( dirp );
if ( NULL == dent )
{
break;
}
if ( 0 == strcmp( ".", dent->d_name ) )
{
continue;
}
if ( 0 == strcmp( "..", dent->d_name ) )
{
continue;
}
struct stat sb = { 0 };
rc = fstatat( dirfd, dent->d_name, &sb, 0 );
if ( 0 != rc )
{
break;
}
if ( S_ISDIR( sb.st_mode ) )
{
int newdirfd = openat( dirfd, dent->d_name, O_RDONLY | O_DIRECTORY );
if ( -1 == newdirfd )
{
rc = -1;
break;
}
printf( "dir: %s\n", dent->d_name );
rc = myftwImp( newdirfd );
if ( 0 != rc )
{
break;
}
}
printf( " file: %s\n", dent->d_name );
}
closedir( dirp );
return( rc );
}
int myftw( const char *dirname )
{
int dirfd = open( dirname, O_RDONLY | O_DIRECTORY );
if ( -1 == dirfd )
{
return( -1 );
}
int rc = myftwImp( dirfd );
return( rc );
}
int main( int argc, char **argv )
{
int rc = myftw( argv[ 1 ] );
return( rc );
}