Is it possible to access the ZFS checksums to compare files?

I have a FreeNAS 8 machine (FreeBSD 8.2-RELEASE-p1) with two different shares on it. I'm migrating files from Share1 to Share2. The Shares have different directory structures and naming conventions. I'm looking for a way to verify that a file on Share1 actually exists (and is accurate) on Share2.

My plan is basically to checksum every file on both Shares and make sure that all the checksums in the Share1 set exist in the Share2 set. I can do this relatively easily with a bash script, but it'll take a long time to generate each of the checksum sets. My question is: Is it possible to access the native ZFS checksums to use them for this comparison?


I don't believe it is possible to extract the block-level checksums from a ZFS filesystem. Even if it were, ZFS checksums are computed per block, not per file, so they probably wouldn't help you compare files anyway.

A while back I wrote a couple of programs for producing a manifest of a filesystem and for comparing an existing filesystem with that manifest.

Here's chksum.pl

#! /usr/bin/perl
#
# generate a system manifest
#

use strict;

use File::Find;
use Digest::MD5;

# Settings shared between initialize(), which fills them in, and wanted(),
# which reads the exclusion patterns.
our %config;

# Autoflush the currently selected output handle so each manifest line is
# written as soon as it is printed.
$| = 1;

main();
exit;

# Build the exclusion pattern list consulted by wanted().
#
# The option list below is written like rsync arguments: an '--exclude' or
# '--include' flag applies to the path that follows it, and '--delete' is
# accepted but ignored.  Results are stored in %config:
#   exclude_list / include_list - the paths exactly as written below
#   ex_pats                     - glob-expanded exclusions, minus any that
#                                 exactly match a glob-expanded inclusion
sub initialize {
    my @options = (
        '--exclude' => '/var/log',
        '--exclude' => '/var/tmp',
        '--exclude' => '/tmp',
    );

    my (@excluded, @included);
    my ($want_exclude, $want_include) = (0, 0);

    foreach my $op (@options) {
        if    ($op eq '--exclude') { $want_exclude = 1; next; }
        elsif ($op eq '--include') { $want_include = 1; next; }
        elsif ($op eq '--delete')  { next; }    # ignore

        # Anything else is a path; file it under the pending flag
        # (exclude takes precedence when both flags are pending).
        if ($want_exclude) {
            push @excluded, $op;
        }
        elsif ($want_include) {
            push @included, $op;
        }
        else {
            warn "don't know what to do with $op\n";
        }
        ($want_exclude, $want_include) = (0, 0);
    }

    $config{'exclude_list'} = \@excluded;
    $config{'include_list'} = \@included;

    my @ex_pats = map { glob($_) } @excluded;
    my @in_pats = map { glob($_) } @included;

#    print STDERR "exclusion patterns:\n", join("\n", @ex_pats), "\n\n";
#    print STDERR "inclusion patterns:\n", join("\n", @in_pats), "\n\n";

    # An exclusion that exactly matches an inclusion is dropped.
    my %is_included = map { $_ => 1 } @in_pats;
    my @kept = grep { !$is_included{$_} } @ex_pats;

    $config{'ex_pats'} = \@kept;
}

# File::Find callback: print one manifest line for the current path.
#
# Line format (space separated, meant to be split with
# Text::ParseWords::shellwords by verify_sum.pl):
#   type mode uid gid "path" [extra]
# where extra is the MD5 hex digest for regular files, the quoted link
# target for symlinks, and two quoted numbers derived from the device id
# for block/character devices.
#
# Paths that match an exclusion pattern in $config{'ex_pats'} are skipped,
# and excluded directories are pruned so find() does not descend into them.
# Regular files that cannot be read, and paths that cannot be lstat'ed, are
# skipped after a warning.
sub wanted {
    my($type, $extra);

    my $path = $File::Find::name;

    # Wrap a string in double quotes, backslash-escaping embedded double
    # quotes and backslashes.  shellwords() removes one level of backslash
    # escaping inside double quotes, so without this a name containing
    # either character would not survive the round trip to verify_sum.pl.
    my $quote = sub {
        my $text = shift;
        $text =~ s/(["\\])/\\$1/g;
        return "\"$text\"";
    };

    # Exclusion patterns are matched as prefixes of the full path.
    my @ex = grep($path =~ m"^$_", @{$config{'ex_pats'}});

    # Fields 2,4,5,6 of lstat are mode, uid, gid and rdev; the call also
    # primes the "_" filehandle used by the file tests below.
    my($mode, $uid, $gid, $dev) = (lstat($path))[2,4,5,6];

    if (scalar(@ex) > 0) {
        # if we're excluding, and it was a dir, don't bother to descend
        $File::Find::prune = -d _;
        return;
    }

    # The path may have disappeared between directory read and lstat;
    # recording it with made-up attributes would only produce noise later.
    unless (defined($mode)) {
        warn "WARNING: couldn't lstat $path: $!\n";
        return;
    }

    $mode &= 07777;                             # keep permission bits only
    if (-d _) {
        $type = "dir ";                         # padded to four characters
    } elsif (-l _) {
        $type = "link";
        $extra = $quote->(readlink($path));
    } elsif (-f _) {
        $type = "file";
        $extra = md5sum($path);
        return unless defined($extra);          # md5sum already warned
    } elsif (-p _) {
        $type = "pipe";
    } elsif (-S _) {
        $type = "sock";
    } elsif (-b _) {
        $type = "bdev";
        # high bits and low 8 bits of rdev (treated as major/minor)
        $extra = "\"" . ($dev >> 8) . " " . ($dev & 0377) . "\"";
    } elsif (-c _) {
        $type = "cdev";
        $extra = "\"" . ($dev >> 8) . " " . ($dev & 0377) . "\"";
    } else {
        $type = "unk ";
    }


    printf "%s 0%04o %d %d %s", $type, $mode, $uid, $gid, $quote->($path);
    print " $extra" if $extra;
    print "\n";
}

# Return the MD5 hex digest of a file's contents.
#
#   $file - path of the file to read
#
# Returns the lowercase hex digest string.  If the file cannot be opened, a
# warning is printed and nothing is returned (undef in scalar context).
sub md5sum {
    my $file = shift;
    my $context = Digest::MD5->new;

    # Three-argument open treats $file purely as a file name.  The
    # two-argument form strips surrounding whitespace from the name and
    # interprets characters such as a trailing "|" as an open mode, which
    # is unsafe for arbitrary names found while walking a filesystem.
    my $fh;
    unless (open($fh, '<', $file)) {
        warn "WARNING: couldn't open $file: $!\n";
        return;
    }
    binmode($fh);                       # digest the raw bytes
    $context->addfile($fh);
    close($fh);
    return $context->hexdigest;
}

# Entry point: load the exclusion configuration, then walk the filesystem
# from '/', letting wanted() print one manifest line per path.
sub main {
    initialize();

    # no_chdir: find() stays in the current directory, so wanted() works
    # from the full path in $File::Find::name.
    my %options = (no_chdir => 1, wanted => \&wanted);
    find(\%options, '/');
}

And here's verify_sum.pl:

#! /usr/bin/perl

# verify that a system matches the sum file produced by chksum.pl

use strict;

use File::Find;
use Digest::MD5;
use Text::ParseWords;

# State shared by initialize(), wanted() and main(): the exclusion patterns,
# the table of paths seen in the manifest, and the result counters.
our %config;

# Autoflush the currently selected output handle.
$| = 1;

main();
exit;

# Build the exclusion pattern list consulted by wanted().
#
# The option list is written like rsync arguments: each '--exclude' or
# '--include' flag applies to the path that follows it; '--delete' is
# accepted and ignored.  Stores in %config:
#   exclude_list / include_list - the paths exactly as listed here
#   ex_pats                     - glob-expanded exclusions with any exact
#                                 match of a glob-expanded inclusion removed
sub initialize {
    my @options = (
        '--exclude' => '/var/log',
        '--exclude' => '/var/tmp',
        '--exclude' => '/tmp',
    );

    my (@excluded, @included);
    my ($want_exclude, $want_include) = (0, 0);

    foreach my $op (@options) {
        if    ($op eq '--exclude') { $want_exclude = 1; next; }
        elsif ($op eq '--include') { $want_include = 1; next; }
        elsif ($op eq '--delete')  { next; }    # ignore

        # A path: assign it to whichever flag is pending (exclude wins
        # if both are pending), then clear both flags.
        if ($want_exclude) {
            push @excluded, $op;
        }
        elsif ($want_include) {
            push @included, $op;
        }
        else {
            warn "don't know what to do with $op\n";
        }
        ($want_exclude, $want_include) = (0, 0);
    }

    $config{'exclude_list'} = \@excluded;
    $config{'include_list'} = \@included;

    my @ex_pats = map { glob($_) } @excluded;
    my @in_pats = map { glob($_) } @included;

#    print STDERR "exclusion patterns:\n", join("\n", @ex_pats), "\n\n";
#    print STDERR "inclusion patterns:\n", join("\n", @in_pats), "\n\n";

    # Keep only the exclusions that are not also listed as inclusions.
    my %is_included = map { $_ => 1 } @in_pats;
    my @kept = grep { !$is_included{$_} } @ex_pats;

    $config{'ex_pats'} = \@kept;
}

# Return the MD5 hex digest of a file's contents.
#
#   $file - path of the file to read
#
# Returns the lowercase hex digest string.  If the file cannot be opened, a
# warning is printed and nothing is returned (undef in scalar context).
sub md5sum {
    my $file = shift;
    my $context = Digest::MD5->new;

    # Use three-argument open so $file is taken literally.  With the
    # two-argument form, surrounding whitespace is stripped from the name
    # and characters such as a trailing "|" select a different open mode.
    my $fh;
    unless (open($fh, '<', $file)) {
        warn "WARNING: couldn't open $file: $!\n";
        return;
    }
    binmode($fh);                       # digest the raw bytes
    $context->addfile($fh);
    close($fh);
    return $context->hexdigest;
}

# File::Find callback for the "new file" pass: report any path on disk that
# did not appear in the manifest.
#
# Paths matching an exclusion pattern in $config{'ex_pats'} are ignored, and
# excluded directories are pruned.  Every other path missing from
# $config{'seen'} is reported with warn() and counted in $config{'new'}.
sub wanted {
    my $path = $File::Find::name;

    # Exclusion patterns are matched as prefixes of the full path.
    my @ex = grep($path =~ m"^$_", @{$config{'ex_pats'}});

    if (scalar(@ex) > 0) {
        # if we're excluding, and it was a dir, don't bother to descend.
        # The variable must be spelled $File::Find::prune: package names
        # are case sensitive, and strict does not catch a misspelled
        # fully qualified name, so a typo here silently disables pruning.
        $File::Find::prune = -d $path;
        return;
    }

    if (! $config{'seen'}->{$path}) {
        warn "new file: $path\n";
        $config{'new'}++;
    }
}

# Entry point: check the live filesystem against a manifest from chksum.pl.
#
# Manifest lines are read with <> (files named on the command line, or
# STDIN).  For each entry the recorded type, mode, uid, gid and the
# type-specific extra field (MD5 digest, link target or device numbers) are
# compared with the path on disk, and every difference is reported with
# warn().  Afterwards the filesystem is walked from '/' so wanted() can
# report paths that are not in the manifest.
#
# Prints a three-line summary, then exits with status 1 if anything was
# changed, missing or new, and 0 otherwise.
sub main {
    $config{'mismatch'} = 0;
    $config{'missing'} = 0;
    $config{'new'} = 0;

    initialize();

    while (<>) {
        chomp;
        # shellwords() strips the quoting that chksum.pl put around the
        # path and the extra field.
        my ($type, $mode, $uid, $gid, $path, $extra) = shellwords($_);
        my ($etype, $eextra);

        $mode = oct($mode);
        # Fields 2,4,5,6 of lstat are mode, uid, gid and rdev; the call
        # also primes the "_" filehandle used by the file tests below.
        my($emode, $euid, $egid, $edev) = (lstat($path))[2,4,5,6];

        if (! -e _) {
            warn "missing file: $path\n";
            $config{'missing'}++;
            next;
        }

        $emode &= 07777;                        # keep permission bits only
        if (-d _) {
            $etype = "dir";
        } elsif (-l _) {
            $etype = "link";
            $eextra = readlink($path);
        } elsif (-f _) {
            $etype = "file";
            $eextra = md5sum($path);
            # md5sum() has already warned if the file was unreadable.  Do
            # not return here: that would abandon the rest of the manifest,
            # the new-file walk and the summary, and the script would exit
            # with status 0.  A placeholder that can never equal a hex
            # digest makes the entry show up as a checksum mismatch.
            $eextra = '(unreadable)' unless defined($eextra);
        } elsif (-p _) {
            $etype = "pipe";
        } elsif (-S _) {
            $etype = "sock";
        } elsif (-b _) {
            $etype = "bdev";
            $eextra = ($edev >> 8) . " " . ($edev & 0377);
        } elsif (-c _) {
            $etype = "cdev";
            $eextra = ($edev >> 8) . " " . ($edev & 0377);
        } else {
            $etype = "unk";
        }

        if ($type ne $etype) {
            warn "mismatch file type ($type vs $etype): $path\n";
            $config{'mismatch'}++;
        } else {
            # types are the same, compare other factors

            if ($mode != $emode) {
                warn sprintf("mismatch mode (0%o vs 0%o): %s\n", $mode, $emode, $path);
                $config{'mismatch'}++;
            }
            if ($uid != $euid) {
                warn "mismatch uid ($uid vs $euid): $path\n";
                $config{'mismatch'}++;
            }
            if ($gid != $egid) {
                warn "mismatch gid ($gid vs $egid): $path\n";
                $config{'mismatch'}++;
            }
            if ($extra ne $eextra) {
                if ($etype eq 'link') {
                    warn "mismatch link target ($extra vs $eextra): $path\n";
                } elsif ($etype eq 'file') {
                    warn "mismatch file checksum ($extra vs $eextra): $path\n";
                } elsif ($etype eq 'bdev' or $etype eq 'cdev') {
                    warn "mismatch device node ($extra vs $eextra): $path\n";
                }
                $config{'mismatch'}++;
            }
        }

        # Remember the path so the walk below does not report it as new.
        $config{'seen'}->{$path}++;
    }

    # now walk the filesystem looking for "new" files

    find({no_chdir => 1, wanted => \&wanted}, '/');

    print "$config{'mismatch'} files changed\n";
    print "$config{'missing'} files missing\n";
    print "$config{'new'} new files\n";

    if ($config{'mismatch'} > 0 or
        $config{'missing'} > 0 or
        $config{'new'} > 0) {
        exit 1;
    }

    exit 0;
}

You'll probably want to tweak these scripts a bit for things like the exclude list, root directory of the comparison, etc.