#!/usr/bin/perl # $Id$ # Copyright (C) 1995-1997 by Zygo Blaxell # Use, modification, and distribution permitted # under the terms of the GNU GPL. $report_only=$ENV{'dupescan_report_only'}; $xdev=$report_only?"":"-xdev"; $*=1; $/="\0"; ($USER=getpwuid($<)) || die "Can't find your username ($<): $!"; $USER=$ENV{'dupescan_user'} || $USER; $ENV{'dupescan_user'}='*' unless $>; $userparam=$ENV{'dupescan_user'} eq '*' ? "" : ( $USER ? "-user $USER" : "-nouser"); if (open(FIND,"find $xdev $userparam -type f ! -empty -printf 'unknown %s %i %p\\0' |")) { while () { ($sum,$size,$inode,$name)=m/^(\S+) (\d+) (\d+) (.+)\0$/; $inode_to_size{$inode}=$size; $inode_to_name{$inode}.="$name\0"; $name_to_inode{$name}=$inode; $size_to_inode{$size}=join("\0",grep($_ != $inode,split(/\0/,$size_to_inode{$size})),$inode); $inode_to_sum{$inode}=$sum; } } close(FIND); # Add previously generated checksums for old files if (open(DB,"<.dupe1db")) { while () { ($sum,$size,$inode,$name)=m/^(\S+) (\d+) (\d+) (.+)\0$/; #print STDERR "sum=$sum, size=$size, inode=$inode, name=$name\n"; next unless ($name_to_inode{$name}==$inode && $inode_to_size{$inode}==$size && $inode_to_sum{$inode} eq 'unknown'); #print STDERR "(accepting)\n"; $inode_to_sum{$inode}=$sum; } } close(DB); if (open(DB,"<.dupe2db")) { while () { ($sum,$size,$inode,$otherinode)=m/^(\S+) (\d+) (\d+) (\d+)\0$/; #print STDERR "sum=$sum, size=$size, inode=$inode, otherinode=$otherinode\n"; next unless ($inode_to_size{$inode}==$size && ($inode_to_sum{$inode} eq $sum)); #print STDERR "(accepting)\n"; $are_different{"$inode,$otherinode"}++; } } close(DB); $saved_total=0; inode: foreach $size (sort { $b <=> $a } keys(%size_to_inode)) { tryloop: until (0) { @inodelist=grep(defined($inode_to_size{$_}),split(/\0/,$size_to_inode{$size})); next inode unless $inodelist[1]; print STDERR "Inodes of size $size: @inodelist\n"; foreach $inode (@inodelist) { if ($inode_to_sum{$inode} eq 'unknown') { $inode_to_sum{$inode}=&do_sum($inode_to_name{$inode}); } else { print STDERR "Already know checksum of $inode is $inode_to_sum{$inode}\n"; } } foreach $i (0..$#inodelist) { $ii=$inodelist[$i]; (@fn1)=split(/\0/,$inode_to_name{$ii}); foreach $j (($i+1)..$#inodelist) { $ij=$inodelist[$j]; next if $inode_to_sum{$ii} ne $inode_to_sum{$ij}; if ($are_different{"$ii,$ij"} || $are_different{"$ij,$ii"}) { print STDERR "Inodes $ii and $ij have identical checksum but are known to be different\n"; next; } print STDERR "Inodes $ii and $ij have identical checksum\n"; (@fn2)=split(/\0/,$inode_to_name{$ij}); print STDERR "Comparing @fn1 with @fn2..."; if (&do_cmp($fn1[0],$fn2[0])) { print STDERR "Different.\n"; $are_different{"$ii,$ij"}++; next; } else { print STDERR "Identical.\n"; $saved_total+=$size; if ($report_only) { print STDOUT "@fn1 and @fn2 are identical ($size bytes).\n"; } else { if ( (@fn1)+0 > (@fn2)+0 ) { @temp=@fn1; $temp=$ii; @fn1=@fn2; $ii=$ij; @fn2=@temp; $ij=$temp; } foreach $target (@fn1) { foreach $source (@fn2) { unlink(".dupelink.$$"); #print STDERR "link($source,.dupelink.$$)..."; warn "link $source, .dupelink.$$: $!" unless link($source,".dupelink.$$"); #print STDERR "rename(.dupelink.$$,$target)..."; last if rename(".dupelink.$$",$target); warn "rename .dupelink.$$, $target: $!"; unlink(".dupelink.$$") || die "Can't delete .dupelink.$$. Exiting for safety reasons: $!"; } #print STDERR "\n"; } system("chmod","-v","a-w,og+r",$fn1[0]); system("chown","-v","0.0",$fn1[0]) unless $>; } # mangle connectivity $inode_to_name{$ij}=join("\0",sort(@fn1,@fn2)); delete($inode_to_name{$ii}); delete($inode_to_sum{$ii}); delete($inode_to_size{$ii}); foreach $name (@fn1) { $name_to_inode{$name}=$ij; } $done=0; next tryloop; } } } last tryloop; } } open(DB,">.dupe1db") || die "open .dupe1db: $!"; foreach $inode (keys(%inode_to_sum)) { $size=$inode_to_size{$inode}; $sum=$inode_to_sum{$inode}; next if $sum eq 'unknown'; foreach $name (split(/\0/,$inode_to_name{$inode})) { $name || next; print DB sprintf("%s %d %d %s\0",$sum,$size,$inode,$name); } } close(DB); open(DB,">.dupe2db") || die "open .dupe2db: $!"; foreach $pair (keys(%are_different)) { ($inode,$otherinode)=split(/,/,$pair); $size=$inode_to_size{$inode}; $sum=$inode_to_sum{$inode}; warn "This shouldn't happen!" if $sum eq 'unknown'; print DB sprintf("%s %d %d %d\0",$sum,$size,$inode,$otherinode); } close(DB); print STDERR "Database updated. Done (saved $saved_total bytes)\n"; sub do_sum { ($the_file=$_[0]) =~ s/\0.*$//; # Crude, but very effective unless (open(INSUM,"-|")) { open(STDIN,"<$the_file\0") || die "open $the_file: $!"; exec("md5sum") || die "exec: $!"; } chop($checksum=); $checksum =~ s/^([\da-fA-F]+).*$/$1/; unless (close(INSUM)) { warn "Checksum exited, status $?"; $checknum++; $checksum="not-a-checksum-$checknum"; } die "Checksum failed: got $checksum" unless $checksum =~ /[a-fA-F0-9]/o; print STDERR "Checksum of $the_file is $checksum\n"; return $checksum; } sub do_cmp { ($file1,$file2) = @_; $file1 =~ s/\0.*$//; $file2 =~ s/\0.*$//; # Crude, but effective. system("cmp","-s",$file1,$file2); return $?; }