#!/usr/bin/perl -w # $Id$ # Copyright (C) 2001 by Zygo Blaxell # Use, modification, and distribution permitted # under the terms of the GNU GPL. use Fcntl; use strict; $/="\0"; umask(022) unless umask(); my $rv = open(FIND, "-|"); die "open: $!" unless defined($rv); unless ($rv) { exec('find',@ARGV,'(','-type','d','-name','.dupemerge','-prune','-o','-true',')','-type','f','-print0'); die "exec failed: $!"; } # Environment variable DUPEMERGE_LOSE_CREDS # The default is to treat two files with identical contents but # different user, group, modification time or permissions as two # distinct files. # Set the environment variable "DUPEMERGE_LOSE_CREDS" to any value # if you want all files with identical contents, regardless of # ownership, permissions, or modification time, to be considered identical. # Note that the permissions (and ownership, if run as root) will be changed # to root.root mode 444 to prevent files from being rendered unreadable # and to make it slightly more difficult to accidentally modify them. # Environment variable DUPEMERGE_HONOR_TIMESTAMPS # If DUPEMERGE_HONOR_TIMESTAMPS is set, files with identical contents but # different time stamps will still be considered distinct. # This only takes effect if DUPEMERGE_LOSE_CREDS is also set. my $distinct = !defined($ENV{DUPEMERGE_LOSE_CREDS}); my $honor_ts = !defined($ENV{DUPEMERGE_HONOR_TIMESTAMPS}); my $savings = 0; my $total = 0; my %seen = (); my $md5sums = 0; while () { chomp; my $file = $_; for my $try (1..3) { eval { # Get basic stats for file my ($file_dev, $file_ino, $file_mode, $file_nlink, $file_uid, $file_gid, $file_rdev, $file_size, $file_atime, $file_mtime, @file_others) = lstat($file); die "lstat: $file: $!" unless @file_others; # print STDERR "$file: $file_dev,$file_ino $file_size $file_uid:$file_gid, $file_mode, $file_mtime\n"; # Total size considered $total += $file_size; # Optimization: Calculate hash based on size of file # If distinct, hash based on size + mtime. my $quicker; if ($distinct) { $quicker = sprintf(".dupemerge/%02x/%02x/%u.%d.%d.%o.%u", $file_mtime & 0xff, ($file_mtime >> 8) & 0xff, $file_mtime, $file_uid, $file_gid, $file_mode, $file_size); } else { if ($honor_ts) { $quicker = sprintf(".dupemerge/%02x/%02x/%u.%u", $file_mtime & 0xff, ($file_mtime >> 8) & 0xff, $file_mtime, $file_size); } else { $quicker = sprintf(".dupemerge/%02x/%02x/%u", $file_size & 0xff, ($file_size >> 8) & 0xff, $file_size); } } # If the quicker file doesn't exist, just link it and skip. unless (-e $quicker) { # print STDERR "quicker=$quicker, file=$file, does not exist...\n"; my $quickpath = $quicker; $quickpath =~ s:/[^/]*$::o; &mkdir_p($quickpath); link($file, $quicker) or die "link: $file -> $quicker: $!"; return; } # print STDERR "quicker=$quicker, file=$file...\n"; # If the quicker file does exist, check its md5sum &get_sum($quicker); &get_sum($file); }; if ($@) { print STDERR $@; } else { warn "Succeeded on try #$try" if $try > 1; last; } } } print STDERR "Total input size processed: $total\n"; print STDERR "Total space saved: $savings\n"; print STDERR "Total md5sums: $md5sums\n"; exit(0); sub get_sum { my($file) = shift(@_); my ($file_dev, $file_ino, $file_mode, $file_nlink, $file_uid, $file_gid, $file_rdev, $file_size, $file_atime, $file_mtime, @file_others) = lstat($file); die "lstat: $file: $!" unless @file_others; # Optimization: If we have seen this file in .dupemerge, ignore it if (exists($seen{"$file_dev,$file_ino"})) { # print STDERR "Already processed $file_dev,$file_ino ($file)\n"; return; } # Get hash of file and split into directories $md5sums++; my $digest = &do_sum($file); # print STDERR "sum($file) = $digest\n"; $digest =~ s:^(..)(..):$1/$2/:o; # Add distinctiveness if ($distinct) { $digest = sprintf(".dupemerge/%s.%d.%d.%o.%u", $digest, $file_uid, $file_gid, $file_mode, $file_mtime); } else { if ($honor_ts) { $digest = sprintf(".dupemerge/%s.%u", $digest, $file_mtime); } else { $digest = sprintf(".dupemerge/%s", $digest); } } # Get path my $path = $digest; $path =~ s:/[^/]*$::o; # Look for file by MD5 signature my ($md5_dev, $md5_ino, $md5_mode, $md5_nlink, @md5_others) = lstat($digest); # Set owner and permissions? my $setop = 0; # Identifier we've seen (or just declare the variable) my $whichseen = "$md5_dev,$md5_ino" if defined($md5_ino); # If no MD5 known, make link from file to MD5 name. # Otherwise, replace file with MD5 link. if (!defined($md5_dev)) { # print STDERR "Linking: $file($file_dev,$file_ino) -> $digest\n"; &mkdir_p($path); link($file, $digest) or die "link: $file -> $digest: $!"; $whichseen = "$file_dev,$file_ino"; } elsif ($md5_dev != $file_dev || $md5_ino != $file_ino) { $setop = 1; print STDERR "Relinking: replacing $file($file_dev,$file_ino) with $digest($md5_dev,$md5_ino)\n"; link($digest, "$digest.link") or die "link: $digest -> $digest.link: $!"; rename("$digest.link", $file) or die "rename: $digest.link -> $file: $!"; $savings += $file_size; } # If non-distinct and we replaced digest with file, accumulate read permissions. # This is kind of bogus, sorry. if ($setop && !$distinct) { my $mode = $file_mode; $mode |= $md5_mode if defined($md5_mode); $mode = ($mode & 0111) ? 0555 : 0444; chmod($mode, $digest); # Ignore errors - usually done by root chown($<, $(, $digest); # Ignore errors - usually done by root } # Use this to short-circuit files we've already processed. $seen{$whichseen} = undef; } sub do_sum { my $the_file; ($the_file=$_[0]) =~ s/\0.*$//; # Crude, but very effective unless (open(INSUM,"-|")) { eval { open(STDIN,"<$the_file\0") || die "open $the_file: $!"; exec("md5sum") || die "exec: $!"; }; print STDERR $@; exit(1); } my $checksum; chop($checksum=); $checksum =~ s/^([\da-fA-F]+).*$/$1/; unless (close(INSUM)) { warn "Checksum exited, status $?"; $checksum = ''; } die "Checksum failed: got $checksum" unless $checksum =~ /[a-fA-F0-9]/o; # print STDERR "Checksum of $the_file is $checksum\n"; return $checksum; } sub mkdir_p { my $dir = shift(@_); return if ( -d $dir ); my ($head, $tail) = ($dir =~ m:^(.*)/([^/]*)$:o); &mkdir_p($head) if defined($head) && length($head); # If the mkdir fails, it's either because we can't create # the directory, or because someone else is making it for us # (e.g. if two dupemerge's running at once). # In the former case, the link/rename pair will also fail, # so the error will be detected and handled. In the latter # case, there is no problem, so we should not use 'die' here. mkdir($dir, 0777) or warn "mkdir: $dir: $!"; }