summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/exclude.zig322
-rw-r--r--src/main.zig7
-rw-r--r--src/scan.zig34
3 files changed, 343 insertions, 20 deletions
diff --git a/src/exclude.zig b/src/exclude.zig
new file mode 100644
index 0000000..8e2b793
--- /dev/null
+++ b/src/exclude.zig
@@ -0,0 +1,322 @@
+// SPDX-FileCopyrightText: 2021-2022 Yoran Heling <projects@yorhel.nl>
+// SPDX-License-Identifier: MIT
+
+const std = @import("std");
+const main = @import("main.zig");
+const c = @cImport(@cInclude("fnmatch.h"));
+
+// Reference:
+// https://manned.org/glob.7
+// https://manned.org/man.b4c7391e/rsync#head17
+// https://manned.org/man.401d6ade/arch/gitignore#head4
+// Patterns:
+// Single component (none of these patterns match a '/'):
+// * -> match any character sequence
+// ? -> match single character
+// [abc] -> match a single character in the given list
+// [a-c] -> match a single character in the given range
+// [!a-c] -> match a single character not in the given range
+// # (these are currently still handled by calling libc fnmatch())
+// Anchored patterns:
+// /pattern
+// /dir/pattern
+// /dir/subdir/pattern
+// # In both rsync and gitignore, anchored patterns are relative to the
+// # directory under consideration. In ncdu they are instead anchored to
+// # the filesystem root (i.e. matched against the absolute path).
+// Non-anchored patterns:
+// somefile
+// subdir/foo
+// sub*/bar
+// # In .gitignore, non-anchored patterns with a slash are implicitely anchored,
+// # in rsync they can match anywhere in a path. We follow rsync here.
+// Dir patterns (trailing '/' matches only dirs):
+// /pattern/
+// somedir/
+// subdir/pattern/
+//
+// BREAKING CHANGE:
+// ncdu < 2.2 single-component matches may cross directory boundary, e.g.
+// 'a*b' matches 'a/b'. This is an old bug, the fix breaks compatibility with
+// old exlude patterns.
+
+const Pattern = struct {
+ isdir: bool = undefined,
+ isliteral: bool = undefined,
+ pattern: [:0]const u8,
+ sub: ?*const Pattern = undefined,
+
+ fn isLiteral(str: []const u8) bool {
+ for (str) |chr| switch (chr) {
+ '[', '*', '?', '\\' => return false,
+ else => {},
+ };
+ return true;
+ }
+
+ fn parse(pat_: []const u8) *const Pattern {
+ var pat = std.mem.trimLeft(u8, pat_, "/");
+ var top = main.allocator.create(Pattern) catch unreachable;
+ var tail = top;
+ tail.sub = null;
+ while (std.mem.indexOfScalar(u8, pat, '/')) |idx| {
+ tail.pattern = main.allocator.dupeZ(u8, pat[0..idx]) catch unreachable;
+ tail.isdir = true;
+ tail.isliteral = isLiteral(tail.pattern);
+ pat = pat[idx+1..];
+ if (std.mem.allEqual(u8, pat, '/')) return top;
+
+ const next = main.allocator.create(Pattern) catch unreachable;
+ tail.sub = next;
+ tail = next;
+ tail.sub = null;
+ }
+ tail.pattern = main.allocator.dupeZ(u8, pat) catch unreachable;
+ tail.isdir = false;
+ tail.isliteral = isLiteral(tail.pattern);
+ return top;
+ }
+};
+
+test "parse" {
+ const t1 = Pattern.parse("");
+ try std.testing.expectEqualStrings(t1.pattern, "");
+ try std.testing.expectEqual(t1.isdir, false);
+ try std.testing.expectEqual(t1.isliteral, true);
+ try std.testing.expectEqual(t1.sub, null);
+
+ const t2 = Pattern.parse("//a//");
+ try std.testing.expectEqualStrings(t2.pattern, "a");
+ try std.testing.expectEqual(t2.isdir, true);
+ try std.testing.expectEqual(t2.isliteral, true);
+ try std.testing.expectEqual(t2.sub, null);
+
+ const t3 = Pattern.parse("foo*/bar.zig");
+ try std.testing.expectEqualStrings(t3.pattern, "foo*");
+ try std.testing.expectEqual(t3.isdir, true);
+ try std.testing.expectEqual(t3.isliteral, false);
+ try std.testing.expectEqualStrings(t3.sub.?.pattern, "bar.zig");
+ try std.testing.expectEqual(t3.sub.?.isdir, false);
+ try std.testing.expectEqual(t3.sub.?.isliteral, true);
+ try std.testing.expectEqual(t3.sub.?.sub, null);
+
+ const t4 = Pattern.parse("/?/sub/dir/");
+ try std.testing.expectEqualStrings(t4.pattern, "?");
+ try std.testing.expectEqual(t4.isdir, true);
+ try std.testing.expectEqual(t4.isliteral, false);
+ try std.testing.expectEqualStrings(t4.sub.?.pattern, "sub");
+ try std.testing.expectEqual(t4.sub.?.isdir, true);
+ try std.testing.expectEqual(t4.sub.?.isliteral, true);
+ try std.testing.expectEqualStrings(t4.sub.?.sub.?.pattern, "dir");
+ try std.testing.expectEqual(t4.sub.?.sub.?.isdir, true);
+ try std.testing.expectEqual(t4.sub.?.sub.?.isliteral, true);
+ try std.testing.expectEqual(t4.sub.?.sub.?.sub, null);
+}
+
+
+// List of patterns to be matched at one particular level.
+// There are 2 different types of lists: those where all patterns have a
+// sub-pointer (where the pattern only matches directories at this level, and
+// the match result is only used to construct the PatternList of the
+// subdirectory) and patterns without a sub-pointer (where the match result
+// determines whether the file/dir at this level should be included or not).
+fn PatternList(withsub: bool) type {
+ return struct {
+ literals: std.HashMapUnmanaged(*const Pattern, Val, Ctx, 80) = .{},
+ wild: std.ArrayListUnmanaged(*const Pattern) = .{},
+
+ // Not a fan of the map-of-arrays approach in the 'withsub' case, it
+ // has a lot of extra allocations. Linking the Patterns together in a
+ // list would be nicer, but that involves mutable Patterns, which in
+ // turn prevents multithreaded scanning. An alternative would be a
+ // sorted array + binary search, but that slows down lookups. Perhaps a
+ // custom hashmap with support for duplicate keys?
+ const Val = if (withsub) std.ArrayListUnmanaged(*const Pattern) else void;
+
+ const Ctx = struct {
+ pub fn hash(_: Ctx, p: *const Pattern) u64 {
+ return std.hash.Wyhash.hash(0, p.pattern);
+ }
+ pub fn eql(_: Ctx, a: *const Pattern, b: *const Pattern) bool {
+ return std.mem.eql(u8, a.pattern, b.pattern);
+ }
+ };
+
+ const Self = @This();
+
+ fn append(self: *Self, pat: *const Pattern) void {
+ std.debug.assert((pat.sub != null) == withsub);
+ if (pat.isliteral) {
+ var e = self.literals.getOrPut(main.allocator, pat) catch unreachable;
+ if (!e.found_existing) {
+ e.key_ptr.* = pat;
+ e.value_ptr.* = .{};
+ }
+ if (!withsub and !pat.isdir and e.key_ptr.*.isdir) e.key_ptr.* = pat;
+ if (withsub) {
+ if (pat.sub) |s| e.value_ptr.*.append(main.allocator, s) catch unreachable;
+ }
+
+ } else self.wild.append(main.allocator, pat) catch unreachable;
+ }
+
+ fn match(self: *const Self, name: [:0]const u8) ?bool {
+ var ret: ?bool = null;
+ if (self.literals.getKey(&.{ .pattern = name })) |p| ret = p.isdir;
+ for (self.wild.items) |p| {
+ if (ret == false) return ret;
+ if (c.fnmatch(p.pattern, name, 0) == 0) ret = p.isdir;
+ }
+ return ret;
+ }
+
+ fn enter(self: *const Self, out: *Patterns, name: [:0]const u8) void {
+ if (self.literals.get(&.{ .pattern = name })) |lst| for (lst.items) |sub| out.append(sub);
+ for (self.wild.items) |p| if (c.fnmatch(p.pattern, name, 0) == 0) out.append(p.sub.?);
+ }
+
+ fn deinit(self: *Self) void {
+ if (withsub) {
+ var it = self.literals.valueIterator();
+ while (it.next()) |e| e.deinit(main.allocator);
+ }
+ self.literals.deinit(main.allocator);
+ self.wild.deinit(main.allocator);
+ self.* = undefined;
+ }
+ };
+}
+
+// List of all patterns that should be matched at one level.
+pub const Patterns = struct {
+ nonsub: PatternList(false) = .{},
+ sub: PatternList(true) = .{},
+ isroot: bool = false,
+
+ fn append(self: *Patterns, pat: *const Pattern) void {
+ if (pat.sub == null) self.nonsub.append(pat)
+ else self.sub.append(pat);
+ }
+
+ // Matches patterns in this level plus unanchored patterns.
+ // Returns null if nothing matches, otherwise whether the given item should
+ // only be exluced if it's a directory.
+ // (Should not be called on root_unanchored)
+ pub fn match(self: *const Patterns, name: [:0]const u8) ?bool {
+ const a = self.nonsub.match(name);
+ if (a == false) return false;
+ const b = root_unanchored.nonsub.match(name);
+ if (b == false) return false;
+ return a orelse b;
+ }
+
+ // Construct the list of patterns for a subdirectory.
+ pub fn enter(self: *const Patterns, name: [:0]const u8) Patterns {
+ var ret = Patterns{};
+ self.sub.enter(&ret, name);
+ root_unanchored.sub.enter(&ret, name);
+ return ret;
+ }
+
+ pub fn deinit(self: *Patterns) void {
+ // getPatterns() result should be deinit()ed, except when it returns the root,
+ // let's simplify that and simply don't deinit root.
+ if (self.isroot) return;
+ self.nonsub.deinit();
+ self.sub.deinit();
+ self.* = undefined;
+ }
+};
+
+// Unanchored patterns that should be checked at every level
+var root_unanchored: Patterns = .{};
+
+// Patterns anchored at the root
+var root: Patterns = .{ .isroot = true };
+
+pub fn addPattern(pattern: []const u8) void {
+ if (pattern.len == 0) return;
+ const p = Pattern.parse(pattern);
+ if (pattern[0] == '/') root.append(p)
+ else root_unanchored.append(p);
+}
+
+// Get the patterns for the given (absolute) path, assuming the given path
+// itself hasn't been excluded. This function is slow, directory walking code
+// should use Patterns.enter() instead.
+pub fn getPatterns(path_: []const u8) Patterns {
+ var path = std.mem.trim(u8, path_, "/");
+ if (path.len == 0) return root;
+ var pat = root;
+ defer pat.deinit();
+ while (std.mem.indexOfScalar(u8, path, '/')) |idx| {
+ var name = main.allocator.dupeZ(u8, path[0..idx]) catch unreachable;
+ defer main.allocator.free(name);
+ path = path[idx+1..];
+
+ var sub = pat.enter(name);
+ pat.deinit();
+ pat = sub;
+ }
+
+ var name = main.allocator.dupeZ(u8, path) catch unreachable;
+ defer main.allocator.free(name);
+ return pat.enter(name);
+}
+
+
+fn testfoo(p: *const Patterns) !void {
+ try std.testing.expectEqual(p.match("root"), null);
+ try std.testing.expectEqual(p.match("bar"), false);
+ try std.testing.expectEqual(p.match("qoo"), false);
+ try std.testing.expectEqual(p.match("xyz"), false);
+ try std.testing.expectEqual(p.match("okay"), null);
+ try std.testing.expectEqual(p.match("somefile"), false);
+ var s = p.enter("okay");
+ try std.testing.expectEqual(s.match("bar"), null);
+ try std.testing.expectEqual(s.match("xyz"), null);
+ try std.testing.expectEqual(s.match("notokay"), false);
+ s.deinit();
+}
+
+test "Matching" {
+ addPattern("/foo/bar");
+ addPattern("/foo/qoo/");
+ addPattern("/foo/qoo");
+ addPattern("/foo/qoo/");
+ addPattern("/f??/xyz");
+ addPattern("/f??/xyz/");
+ addPattern("/*o/somefile");
+ addPattern("/a??/okay");
+ addPattern("/roo?");
+ addPattern("/root/");
+ addPattern("excluded");
+ addPattern("somefile/");
+ addPattern("o*y/not[o]kay");
+
+ var a0 = getPatterns("/");
+ try std.testing.expectEqual(a0.match("a"), null);
+ try std.testing.expectEqual(a0.match("excluded"), false);
+ try std.testing.expectEqual(a0.match("somefile"), true);
+ try std.testing.expectEqual(a0.match("root"), false);
+ var a1 = a0.enter("foo");
+ a0.deinit();
+ try testfoo(&a1);
+ a1.deinit();
+
+ var b0 = getPatterns("/somedir/somewhere");
+ try std.testing.expectEqual(b0.match("a"), null);
+ try std.testing.expectEqual(b0.match("excluded"), false);
+ try std.testing.expectEqual(b0.match("root"), null);
+ try std.testing.expectEqual(b0.match("okay"), null);
+ var b1 = b0.enter("okay");
+ b0.deinit();
+ try std.testing.expectEqual(b1.match("excluded"), false);
+ try std.testing.expectEqual(b1.match("okay"), null);
+ try std.testing.expectEqual(b1.match("notokay"), false);
+ b1.deinit();
+
+ var c0 = getPatterns("/foo/");
+ try testfoo(&c0);
+ c0.deinit();
+}
diff --git a/src/main.zig b/src/main.zig
index 70e61a5..593cb04 100644
--- a/src/main.zig
+++ b/src/main.zig
@@ -10,6 +10,7 @@ const ui = @import("ui.zig");
const browser = @import("browser.zig");
const delete = @import("delete.zig");
const util = @import("util.zig");
+const exclude = @import("exclude.zig");
const c = @cImport(@cInclude("locale.h"));
// "Custom" allocator that wraps the libc allocator and calls ui.oom() on error.
@@ -229,7 +230,7 @@ fn argConfig(args: *Args, opt: Args.Option) bool {
else if (opt.is("--no-si")) config.si = false
else if (opt.is("-L") or opt.is("--follow-symlinks")) config.follow_symlinks = true
else if (opt.is("--no-follow-symlinks")) config.follow_symlinks = false
- else if (opt.is("--exclude")) config.exclude_patterns.append(allocator.dupeZ(u8, args.arg()) catch unreachable) catch unreachable
+ else if (opt.is("--exclude")) exclude.addPattern(args.arg())
else if (opt.is("-X") or opt.is("--exclude-from")) {
const arg = args.arg();
readExcludeFile(arg) catch |e| ui.die("Error reading excludes from {s}: {s}.\n", .{ arg, ui.errorString(e) });
@@ -381,11 +382,12 @@ fn readExcludeFile(path: [:0]const u8) !void {
defer f.close();
var rd = std.io.bufferedReader(f.reader()).reader();
var buf = std.ArrayList(u8).init(allocator);
+ defer buf.deinit();
while (true) {
rd.readUntilDelimiterArrayList(&buf, '\n', 4096)
catch |e| if (e != error.EndOfStream) return e else if (buf.items.len == 0) break;
if (buf.items.len > 0)
- config.exclude_patterns.append(buf.toOwnedSliceSentinel(0) catch unreachable) catch unreachable;
+ exclude.addPattern(util.arrayListBufZ(&buf));
}
}
@@ -552,6 +554,7 @@ test "imports" {
_ = @import("model.zig");
_ = @import("ui.zig");
_ = @import("util.zig");
+ _ = @import("exclude.zig");
}
test "argument parser" {
diff --git a/src/scan.zig b/src/scan.zig
index 86f7061..68b74c8 100644
--- a/src/scan.zig
+++ b/src/scan.zig
@@ -6,8 +6,8 @@ const main = @import("main.zig");
const model = @import("model.zig");
const ui = @import("ui.zig");
const util = @import("util.zig");
+const exclude = @import("exclude.zig");
const c_statfs = @cImport(@cInclude("sys/vfs.h"));
-const c_fnmatch = @cImport(@cInclude("fnmatch.h"));
// Concise stat struct for fields we're interested in, with the types used by the model.
@@ -450,7 +450,7 @@ const Context = struct {
var active_context: *Context = undefined;
// Read and index entries of the given dir.
-fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void {
+fn scanDir(ctx: *Context, pat: *const exclude.Patterns, dir: std.fs.Dir, dir_dev: u64) void {
var it = main.allocator.create(std.fs.Dir.Iterator) catch unreachable;
defer main.allocator.destroy(it);
it.* = dir.iterate();
@@ -465,19 +465,8 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void {
defer ctx.popPath();
main.handleEvent(false, false);
- // XXX: This algorithm is extremely slow, can be optimized with some clever pattern parsing.
- const excluded = blk: {
- for (main.config.exclude_patterns.items) |pat| {
- var path = ctx.pathZ();
- while (path.len > 0) {
- if (c_fnmatch.fnmatch(pat, path, 0) == 0) break :blk true;
- if (std.mem.indexOfScalar(u8, path, '/')) |idx| path = path[idx+1..:0]
- else break;
- }
- }
- break :blk false;
- };
- if (excluded) {
+ const excluded = pat.match(ctx.name);
+ if (excluded == false) { // matched either a file or directory, so we can exclude this before stat()ing.
ctx.addSpecial(.excluded);
continue;
}
@@ -486,7 +475,6 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void {
ctx.addSpecial(.err);
continue;
};
-
if (main.config.same_fs and ctx.stat.dev != dir_dev) {
ctx.addSpecial(.other_fs);
continue;
@@ -503,6 +491,10 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void {
}
} else |_| {}
}
+ if (excluded) |e| if (e and ctx.stat.dir) {
+ ctx.addSpecial(.excluded);
+ continue;
+ };
var edir =
if (ctx.stat.dir) dir.openDirZ(ctx.name, .{ .access_sub_paths = true, .iterate = true, .no_follow = true }) catch {
@@ -530,7 +522,11 @@ fn scanDir(ctx: *Context, dir: std.fs.Dir, dir_dev: u64) void {
}
ctx.addStat(dir_dev);
- if (ctx.stat.dir) scanDir(ctx, edir.?, ctx.stat.dev);
+ if (ctx.stat.dir) {
+ var subpat = pat.enter(ctx.name);
+ defer subpat.deinit();
+ scanDir(ctx, &subpat, edir.?, ctx.stat.dev);
+ }
}
}
@@ -568,7 +564,9 @@ pub fn scan() void {
return;
};
defer dir.close();
- scanDir(active_context, dir, active_context.stat.dev);
+ var pat = exclude.getPatterns(active_context.pathZ());
+ defer pat.deinit();
+ scanDir(active_context, &pat, dir, active_context.stat.dev);
active_context.popPath();
active_context.final();
}