Implement --exclude-kernfs and --exclude-pattern

Easier to implement now that we're linking against libc. But exclude pattern matching is extremely slow, so that should really be rewritten with a custom fnmatch implementation. It's exactly as slow as in ncdu 1.x as well; I'm surprised nobody's complained about it yet. And while I'm at it, supporting .gitignore-style patterns would be pretty neat, too.
author: Yorhel <git@yorhel.nl> 2021-05-03 14:41:48 +0200
committer: Yorhel <git@yorhel.nl> 2021-05-03 14:41:50 +0200
commit: a28a0788c3a7a697e73fe07347e9ff036edd64fc (patch)
tree: 5487ff135650143f22904e2b80ea7da52cb3c63a
parent: 826c2fc067a49305042403f05304931f44ad06e0 (diff)
3 files changed, 183 insertions, 81 deletions
diff --git a/src/main.zig b/src/main.zig
index 452d78e..24d1461 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -12,10 +12,10 @@ pub const allocator = std.heap.c_allocator;
 pub const Config = struct {
     same_fs: bool = true,
     extended: bool = false,
-    exclude_caches: bool = false,
     follow_symlinks: bool = false,
+    exclude_caches: bool = false,
     exclude_kernfs: bool = false,
-    // TODO: exclude patterns
+    exclude_patterns: std.ArrayList([:0]const u8) = std.ArrayList([:0]const u8).init(allocator),
 
     update_delay: u32 = 100,
     si: bool = false,
@@ -31,14 +31,14 @@ pub const Config = struct {
 pub var config = Config{};
 
 // Simple generic argument parser, supports getopt_long() style arguments.
-// T can be any type that has a 'fn next(T) ?[]const u8' method, e.g.:
+// T can be any type that has a 'fn next(T) ?[:0]const u8' method, e.g.:
 //   var args = Args(std.process.ArgIteratorPosix).init(std.process.ArgIteratorPosix.init());
 fn Args(T: anytype) type {
     return struct {
         it: T,
-        short: ?[]const u8 = null, // Remainder after a short option, e.g. -x<stuff> (which may be either more short options or an argument)
+        short: ?[:0]const u8 = null, // Remainder after a short option, e.g. -x<stuff> (which may be either more short options or an argument)
         last: ?[]const u8 = null,
-        last_arg: ?[]const u8 = null, // In the case of --option=<arg>
+        last_arg: ?[:0]const u8 = null, // In the case of --option=<arg>
         shortbuf: [2]u8 = undefined,
         argsep: bool = false,
 
@@ -56,10 +56,10 @@ fn Args(T: anytype) type {
             return Self{ .it = it };
         }
 
-        fn shortopt(self: *Self, s: []const u8) Option {
+        fn shortopt(self: *Self, s: [:0]const u8) Option {
             self.shortbuf[0] = '-';
             self.shortbuf[1] = s[0];
-            self.short = if (s.len > 1) s[1..] else null;
+            self.short = if (s.len > 1) s[1.. :0] else null;
             self.last = &self.shortbuf;
             return .{ .opt = true, .val = &self.shortbuf };
         }
@@ -87,11 +87,11 @@ fn Args(T: anytype) type {
                 self.last = val;
                 return Option{ .opt = true, .val = val };
             }
-            return self.shortopt(val[1..]);
+            return self.shortopt(val[1..:0]);
         }
 
         /// Returns the argument given to the last returned option. Dies with an error if no argument is provided.
-        pub fn arg(self: *Self) []const u8 {
+        pub fn arg(self: *Self) [:0]const u8 {
             if (self.short) |a| {
                 defer self.short = null;
                 return a;
@@ -175,6 +175,19 @@ fn help() noreturn {
     std.process.exit(0);
 }
 
+fn readExcludeFile(path: []const u8) !void {
+    const f = try std.fs.cwd().openFile(path, .{});
+    defer f.close();
+    var rd = std.io.bufferedReader(f.reader()).reader();
+    var buf = std.ArrayList(u8).init(allocator);
+    while (true) {
+        rd.readUntilDelimiterArrayList(&buf, '\n', 4096)
+            catch |e| if (e != error.EndOfStream) return e else if (buf.items.len == 0) break;
+        if (buf.items.len > 0)
+            try config.exclude_patterns.append(try buf.toOwnedSliceSentinel(0));
+    }
+}
+
 pub fn main() anyerror!void {
     // Grab thousands_sep from the current C locale.
     // (We can safely remove this when not linking against libc, it's a somewhat obscure feature)
@@ -206,7 +219,11 @@ pub fn main() anyerror!void {
         else if(opt.is("-r")) config.read_only = true
         else if(opt.is("--si")) config.si = true
         else if(opt.is("-L") or opt.is("--follow-symlinks")) config.follow_symlinks = true
-        else if(opt.is("--exclude-caches")) config.exclude_caches = true
+        else if(opt.is("--exclude")) try config.exclude_patterns.append(args.arg())
+        else if(opt.is("-X") or opt.is("--exclude-from")) {
+            const arg = args.arg();
+            readExcludeFile(arg) catch |e| ui.die("Error reading excludes from {s}: {}.\n", .{ arg, e });
+        } else if(opt.is("--exclude-caches")) config.exclude_caches = true
         else if(opt.is("--exclude-kernfs")) config.exclude_kernfs = true
         else if(opt.is("--confirm-quit")) config.confirm_quit = true
         else if(opt.is("--color")) {
@@ -215,9 +232,12 @@ pub fn main() anyerror!void {
             else if (std.mem.eql(u8, val, "dark")) config.ui_color = .dark
             else ui.die("Unknown --color option: {s}.\n", .{val});
         } else ui.die("Unrecognized option '{s}'.\n", .{opt.val});
-        // TODO: -o, -f, -0, -1, -2, --exclude, -X, --exclude-from
+        // TODO: -o, -f, -0, -1, -2
     }
 
+    if (std.builtin.os.tag != .linux and config.exclude_kernfs)
+        ui.die("The --exclude-kernfs tag is currently only supported on Linux.\n", .{});
+
     try scan.scanRoot(scan_dir orelse ".");
 
     ui.init();
diff --git a/src/scan.zig b/src/scan.zig
index 88b3c10..ee673d8 100644
--- a/src/scan.zig
+++ b/src/scan.zig
@@ -1,6 +1,8 @@
 const std = @import("std");
 const main = @import("main.zig");
 const model = @import("model.zig");
+const c_statfs = @cImport(@cInclude("sys/vfs.h"));
+const c_fnmatch = @cImport(@cInclude("fnmatch.h"));
 
 
 // Concise stat struct for fields we're interested in, with the types used by the model.
@@ -14,108 +16,188 @@ const Stat = struct {
     reg: bool,
     symlink: bool,
     ext: model.Ext,
-};
 
-// Cast any integer type to the target type, clamping the value to the supported maximum if necessary.
-fn castClamp(comptime T: type, x: anytype) T {
-    // (adapted from std.math.cast)
-    if (std.math.maxInt(@TypeOf(x)) > std.math.maxInt(T) and x > std.math.maxInt(T)) {
-        return std.math.maxInt(T);
-    } else if (std.math.minInt(@TypeOf(x)) < std.math.minInt(T) and x < std.math.minInt(T)) {
-        return std.math.minInt(T);
-    } else {
-        return @intCast(T, x);
+    // Cast any integer type to the target type, clamping the value to the supported maximum if necessary.
+    fn castClamp(comptime T: type, x: anytype) T {
+        // (adapted from std.math.cast)
+        if (std.math.maxInt(@TypeOf(x)) > std.math.maxInt(T) and x > std.math.maxInt(T)) {
+            return std.math.maxInt(T);
+        } else if (std.math.minInt(@TypeOf(x)) < std.math.minInt(T) and x < std.math.minInt(T)) {
+            return std.math.minInt(T);
+        } else {
+            return @intCast(T, x);
+        }
     }
-}
 
-// Cast any integer type to the target type, truncating if necessary.
-fn castTruncate(comptime T: type, x: anytype) T {
-    const Ti = @typeInfo(T).Int;
-    const Xi = @typeInfo(@TypeOf(x)).Int;
-    const nx = if (Xi.signedness != Ti.signedness) @bitCast(std.meta.Int(Ti.signedness, Xi.bits), x) else x;
-    return if (Xi.bits > Ti.bits) @truncate(T, nx) else nx;
-}
+    // Cast any integer type to the target type, truncating if necessary.
+    fn castTruncate(comptime T: type, x: anytype) T {
+        const Ti = @typeInfo(T).Int;
+        const Xi = @typeInfo(@TypeOf(x)).Int;
+        const nx = if (Xi.signedness != Ti.signedness) @bitCast(std.meta.Int(Ti.signedness, Xi.bits), x) else x;
+        return if (Xi.bits > Ti.bits) @truncate(T, nx) else nx;
+    }
 
-fn clamp(comptime T: type, comptime field: anytype, x: anytype) std.meta.fieldInfo(T, field).field_type {
-    return castClamp(std.meta.fieldInfo(T, field).field_type, x);
-}
+    fn clamp(comptime T: type, comptime field: anytype, x: anytype) std.meta.fieldInfo(T, field).field_type {
+        return castClamp(std.meta.fieldInfo(T, field).field_type, x);
+    }
 
-fn truncate(comptime T: type, comptime field: anytype, x: anytype) std.meta.fieldInfo(T, field).field_type {
-    return castTruncate(std.meta.fieldInfo(T, field).field_type, x);
-}
+    fn truncate(comptime T: type, comptime field: anytype, x: anytype) std.meta.fieldInfo(T, field).field_type {
+        return castTruncate(std.meta.fieldInfo(T, field).field_type, x);
+    }
 
-fn readStat(parent: std.fs.Dir, name: [:0]const u8, follow: bool) !Stat {
-    const stat = try std.os.fstatatZ(parent.fd, name, if (follow) 0 else std.os.AT_SYMLINK_NOFOLLOW);
-    return Stat{
-        .blocks = clamp(Stat, .blocks, stat.blocks),
-        .size = clamp(Stat, .size, stat.size),
-        .dev = truncate(Stat, .dev, stat.dev),
-        .ino = truncate(Stat, .ino, stat.ino),
-        .nlink = clamp(Stat, .nlink, stat.nlink),
-        .dir = std.os.system.S_ISDIR(stat.mode),
-        .reg = std.os.system.S_ISREG(stat.mode),
-        .symlink = std.os.system.S_ISLNK(stat.mode),
-        .ext = .{
-            .mtime = clamp(model.Ext, .mtime, stat.mtime().tv_sec),
-            .uid = truncate(model.Ext, .uid, stat.uid),
-            .gid = truncate(model.Ext, .gid, stat.gid),
-            .mode = truncate(model.Ext, .mode, stat.mode),
-        },
+    fn read(parent: std.fs.Dir, name: [:0]const u8, follow: bool) !Stat {
+        const stat = try std.os.fstatatZ(parent.fd, name, if (follow) 0 else std.os.AT_SYMLINK_NOFOLLOW);
+        return Stat{
+            .blocks = clamp(Stat, .blocks, stat.blocks),
+            .size = clamp(Stat, .size, stat.size),
+            .dev = truncate(Stat, .dev, stat.dev),
+            .ino = truncate(Stat, .ino, stat.ino),
+            .nlink = clamp(Stat, .nlink, stat.nlink),
+            .dir = std.os.system.S_ISDIR(stat.mode),
+            .reg = std.os.system.S_ISREG(stat.mode),
+            .symlink = std.os.system.S_ISLNK(stat.mode),
+            .ext = .{
+                .mtime = clamp(model.Ext, .mtime, stat.mtime().tv_sec),
+                .uid = truncate(model.Ext, .uid, stat.uid),
+                .gid = truncate(model.Ext, .gid, stat.gid),
+                .mode = truncate(model.Ext, .mode, stat.mode),
+            },
+        };
+    }
+};
+
+var kernfs_cache: std.AutoHashMap(u64,bool) = std.AutoHashMap(u64,bool).init(main.allocator);
+
+// This function only works on Linux
+fn isKernfs(dir: std.fs.Dir, dev: u64) bool {
+    if (kernfs_cache.get(dev)) |e| return e;
+    var buf: c_statfs.struct_statfs = undefined;
+    if (c_statfs.fstatfs(dir.fd, &buf) != 0) return false; // silently ignoring errors isn't too nice.
+    const iskern = switch (buf.f_type) {
+        // These numbers are documented in the Linux 'statfs(2)' man page, so I assume they're stable.
+        0x42494e4d, // BINFMTFS_MAGIC
+        0xcafe4a11, // BPF_FS_MAGIC
+        0x27e0eb, // CGROUP_SUPER_MAGIC
+        0x63677270, // CGROUP2_SUPER_MAGIC
+        0x64626720, // DEBUGFS_MAGIC
+        0x1cd1, // DEVPTS_SUPER_MAGIC
+        0x9fa0, // PROC_SUPER_MAGIC
+        0x6165676c, // PSTOREFS_MAGIC
+        0x73636673, // SECURITYFS_MAGIC
+        0xf97cff8c, // SELINUX_MAGIC
+        0x62656572, // SYSFS_MAGIC
+        0x74726163 // TRACEFS_MAGIC
+        => true,
+        else => false,
     };
+    kernfs_cache.put(dev, iskern) catch {};
+    return iskern;
 }
 
-// Read and index entries of the given dir. The entry for the directory is already assumed to be in 'parents'.
+const Context = struct {
+    parents: model.Parents = .{},
+    path: std.ArrayList(u8) = std.ArrayList(u8).init(main.allocator),
+    path_indices: std.ArrayList(usize) = std.ArrayList(usize).init(main.allocator),
+
+    // 0-terminated name of the top entry, points into 'path', invalid after popPath().
+    // This is a workaround to Zig's directory iterator not returning a [:0]const u8.
+    name: [:0]const u8 = undefined,
+
+    const Self = @This();
+
+    fn pushPath(self: *Self, name: []const u8) !void {
+        try self.path_indices.append(self.path.items.len);
+        if (self.path.items.len > 1) try self.path.append('/');
+        const start = self.path.items.len;
+        try self.path.appendSlice(name);
+
+        try self.path.append(0);
+        self.name = self.path.items[start..self.path.items.len-1:0];
+        self.path.items.len -= 1;
+    }
+
+    fn popPath(self: *Self) void {
+        self.path.items.len = self.path_indices.items[self.path_indices.items.len-1];
+        self.path_indices.items.len -= 1;
+    }
+};
+
+// Read and index entries of the given dir. The entry for the directory is already assumed to be in 'ctx.parents'.
 // (TODO: shouldn't error on OOM but instead call a function that waits or something)
-fn scanDir(parents: *model.Parents, dir: std.fs.Dir) std.mem.Allocator.Error!void {
+fn scanDir(ctx: *Context, dir: std.fs.Dir) std.mem.Allocator.Error!void {
     var it = dir.iterate();
     while(true) {
         const entry = it.next() catch {
-            parents.top().entry.set_err(parents);
+            ctx.parents.top().entry.set_err(&ctx.parents);
             return;
         } orelse break;
 
-        // TODO: Check for exclude patterns
+        try ctx.pushPath(entry.name);
+        defer ctx.popPath();
 
-        // XXX: Surely the name already has a trailing \0 in the buffer received by the OS?
-        // XXX#2: Does this allocate PATH_MAX bytes on the stack for each level of recursion!?
-        const name_z = std.os.toPosixPath(entry.name) catch undefined;
-        var stat = readStat(dir, &name_z, false) catch {
+        // XXX: This algorithm is extremely slow, can be optimized with some clever pattern parsing.
+        const excluded = blk: {
+            for (main.config.exclude_patterns.items) |pat| {
+                ctx.path.append(0) catch unreachable;
+                var path = ctx.path.items[0..ctx.path.items.len-1:0];
+                ctx.path.items.len -= 1;
+                while (path.len > 0) {
+                    if (c_fnmatch.fnmatch(pat, path, 0) == 0) break :blk true;
+                    if (std.mem.indexOfScalar(u8, path, '/')) |idx| path = path[idx+1..:0]
+                    else break;
+                }
+            }
+            break :blk false;
+        };
+        if (excluded) {
             var e = try model.Entry.create(.file, false, entry.name);
-            e.insert(parents) catch unreachable;
-            e.set_err(parents);
+            e.file().?.excluded = true;
+            e.insert(&ctx.parents) catch unreachable;
+            continue;
+        }
+
+        var stat = Stat.read(dir, ctx.name, false) catch {
+            var e = try model.Entry.create(.file, false, entry.name);
+            e.insert(&ctx.parents) catch unreachable;
+            e.set_err(&ctx.parents);
             continue;
         };
 
-        if (main.config.same_fs and stat.dev != model.getDev(parents.top().dev)) {
+        if (main.config.same_fs and stat.dev != model.getDev(ctx.parents.top().dev)) {
             var e = try model.Entry.create(.file, false, entry.name);
             e.file().?.other_fs = true;
-            e.insert(parents) catch unreachable;
+            e.insert(&ctx.parents) catch unreachable;
             continue;
         }
 
         if (main.config.follow_symlinks and stat.symlink) {
-            if (readStat(dir, &name_z, true)) |nstat| {
+            if (Stat.read(dir, ctx.name, true)) |nstat| {
                 if (!nstat.dir) {
                     stat = nstat;
                     // Symlink targets may reside on different filesystems,
                     // this will break hardlink detection and counting so let's disable it.
-                    if (stat.nlink > 1 and stat.dev != model.getDev(parents.top().dev))
+                    if (stat.nlink > 1 and stat.dev != model.getDev(ctx.parents.top().dev))
                         stat.nlink = 1;
                 }
             } else |_| {}
         }
 
-        // TODO: Check for kernfs; Zig has no wrappers for fstatfs() yet and calling the syscall directly doesn't seem too trivial. :(
-
         var edir =
-            if (stat.dir) dir.openDirZ(&name_z, .{ .access_sub_paths = true, .iterate = true, .no_follow = true }) catch {
+            if (stat.dir) dir.openDirZ(ctx.name, .{ .access_sub_paths = true, .iterate = true, .no_follow = true }) catch {
                 var e = try model.Entry.create(.file, false, entry.name);
-                e.insert(parents) catch unreachable;
-                e.set_err(parents);
+                e.insert(&ctx.parents) catch unreachable;
+                e.set_err(&ctx.parents);
                 continue;
             } else null;
         defer if (edir != null) edir.?.close();
 
+        if (std.builtin.os.tag == .linux and main.config.exclude_kernfs and stat.dir and isKernfs(edir.?, stat.dev)) {
+            var e = try model.Entry.create(.file, false, entry.name);
+            e.file().?.kernfs = true;
+            e.insert(&ctx.parents) catch unreachable;
+            continue;
+        }
+
         if (main.config.exclude_caches and stat.dir) {
             if (edir.?.openFileZ("CACHEDIR.TAG", .{})) |f| {
                 const sig = "Signature: 8a477f597d28d172789f06886806bc55";
@@ -124,7 +206,7 @@ fn scanDir(parents: *model.Parents, dir: std.fs.Dir) std.mem.Allocator.Error!voi
                     if (len == sig.len and std.mem.eql(u8, &buf, sig)) {
                         var e = try model.Entry.create(.file, false, entry.name);
                         e.file().?.excluded = true;
-                        e.insert(parents) catch unreachable;
+                        e.insert(&ctx.parents) catch unreachable;
                         continue;
                     }
                 } else |_| {}
@@ -148,12 +230,12 @@ fn scanDir(parents: *model.Parents, dir: std.fs.Dir) std.mem.Allocator.Error!voi
             l.nlink = stat.nlink;
         }
         if (e.ext()) |ext| ext.* = stat.ext;
-        try e.insert(parents);
+        try e.insert(&ctx.parents);
 
         if (e.dir()) |d| {
-            try parents.push(d);
-            try scanDir(parents, edir.?);
-            parents.pop();
+            try ctx.parents.push(d);
+            try scanDir(ctx, edir.?);
+            ctx.parents.pop();
         }
     }
 }
@@ -162,14 +244,15 @@ pub fn scanRoot(path: []const u8) !void {
     const full_path = std.fs.realpathAlloc(main.allocator, path) catch path;
     model.root = (try model.Entry.create(.dir, false, full_path)).dir().?;
 
-    const stat = try readStat(std.fs.cwd(), model.root.entry.name(), true);
+    const stat = try Stat.read(std.fs.cwd(), model.root.entry.name(), true);
     if (!stat.dir) return error.NotADirectory;
     model.root.entry.blocks = stat.blocks;
     model.root.entry.size = stat.size;
     model.root.dev = try model.getDevId(stat.dev);
     if (model.root.entry.ext()) |ext| ext.* = stat.ext;
 
-    var parents = model.Parents{};
+    var ctx = Context{};
+    try ctx.pushPath(full_path);
     const dir = try std.fs.cwd().openDirZ(model.root.entry.name(), .{ .access_sub_paths = true, .iterate = true });
-    try scanDir(&parents, dir);
+    try scanDir(&ctx, dir);
 }
diff --git a/src/ui.zig b/src/ui.zig
index 88410e0..dff3642 100644
--- a/src/ui.zig
+++ b/src/ui.zig
@@ -6,7 +6,6 @@ const main = @import("main.zig");
 pub const c = @cImport({
     @cInclude("stdio.h");
     @cInclude("string.h");
-    @cInclude("unistd.h");
     @cInclude("curses.h");
 });
 
@@ -121,7 +120,7 @@ pub fn init() void {
         if (term == null) die("Error initializing ncurses.\n", .{});
         _ = c.set_term(term);
     } else {
-        if (c.isatty(0) != 1) die("Standard input is not a TTY. Did you mean to import a file using '-f -'?\n", .{});
+        if (!std.io.getStdIn().isTty()) die("Standard input is not a TTY. Did you mean to import a file using '-f -'?\n", .{});
         if (c.initscr() == null) die("Error initializing ncurses.\n", .{});
     }
     updateSize();
author	Yorhel <git@yorhel.nl>	2021-05-03 14:41:48 +0200
committer	Yorhel <git@yorhel.nl>	2021-05-03 14:41:50 +0200
commit	a28a0788c3a7a697e73fe07347e9ff036edd64fc (patch)
tree	5487ff135650143f22904e2b80ea7da52cb3c63a
parent	826c2fc067a49305042403f05304931f44ad06e0 (diff)