#!/usr/bin/tclsh # $Id$ # Copyright (C) 2001 by Zygo Blaxell # Use, modification, and distribution permitted # under the terms of the GNU GPL. load libdb_tcl.so # package require Itcl set preserve_timestamps 1 set preserve_access 1 set preserve_zeros 1 set no_sync 0 set queue_only 0 set verbose 0 set debug 0 set directories {} foreach arg $argv { switch -exact -- $arg { --timestamps { set preserve_timestamps 0 } --zeros { set preserve_zeros 0 } --access { set preserve_access 0 } --fast { set no_sync 1 } --queue { set queue_only 1 } --verbose { set verbose 1 } --debug { set debug 1 } default { lappend directories $arg } } } if {$preserve_timestamps} { puts stderr "Preserving timestamps." } else { puts stderr "Merging files with distinct timestamps." } if {$preserve_access} { puts stderr "Preserving ownership and access permissions." } else { puts stderr "Merging files with distinct ownership and access permissions." } if {$preserve_zeros} { puts stderr "Preserving zero-length files." } else { puts stderr "Merging files of zero length." } if {$no_sync} { puts stderr "Turning off synchronous database transactions." } else { puts stderr "Using synchronous database transactions." } if {$queue_only} { puts stderr "Queueing roots only." } else { puts stderr "Doing all the processing immediately." } if {$verbose} { proc quiet {args} { uplevel 1 $args } puts stderr "Being verbose." } else { proc quiet {args} {} puts stderr "Being quiet." } if {$debug} { proc debug {args} { uplevel 1 $args } puts stderr "Debugging information." } else { proc debug {args} {} } set handles_to_close {} foreach db {ihashes istats iqueue inames statis hashis} { puts -nonewline stderr "Opening database $db " set $db [berkdb open -btree] puts stderr "in handle [set $db]." lappend handles_to_close [set $db] } proc lookup_iname {child_id txn} { global dbenv inames ## puts -nonewline stderr "Looking up $child_id..." set names {} ## puts -nonewline stderr "Reading inames..." foreach dbt [$inames get $child_id] { # -txn $txn foreach {key value} $dbt { foreach {my_name parent_id} $value { foreach parent_name [lookup_iname $parent_id $txn] { lappend names [file join $parent_name $my_name] } } } } ## puts stderr "$child_id: $names" if {![llength $names]} { lappend names {} } return $names } proc add_root_to_db {name} { global statis inames istats iqueue dbenv puts stderr "Adding root named [list $name]..." # Note there is no validation whatsoever of root names # as far as symlinks are concerned. cd $name set name [pwd] file lstat . root_stats set root_id $root_stats(ino) ## puts -nonewline stderr "txn..." set txn {$dbenv txn} ## puts -nonewline stderr "Writing inames..." $inames put $root_id [list $name ROOT]; # -txn $txn $iqueue put [expr rand()] [list $root_stats(dev) $root_stats(ino)] ; # -txn $txn # Commit ## puts -nonewline stderr "commit..." # $txn commit set txn {} puts stderr "done." } proc check_inode {name dev ino} { regsub {^[^/]} $name ./& name file lstat $name stat if {$stat(dev) != $dev || $stat(ino) != $ino} { set rv "check_inode: $name: expected dev=$dev ino=$ino, got dev=$stat(dev) ino=$stat(ino)" puts stderr $rv error $rv } } proc careful_cd {dir dev ino} { cd $dir check_inode . $dev $ino } proc unique_function {array} { global preserve_timestamps preserve_access upvar $array x set rv $x(size) if {$preserve_timestamps} { lappend rv $x(mtime) } if {$preserve_access} { lappend rv $x(uid) $x(gid) $x(mode) } return $rv } proc do_md5sum {parent_dev ino sikey txn} { global statis ihashes inames istats dbenv iqueue hashis # Find out if the ihash of ino is in cache set dbt_list [$ihashes get $ino] set ino_was_cached [llength $dbt_list] ; # -txn $txn if {$ino_was_cached} { # Return hash from cache foreach dbt $dbt_list { foreach {inode hash_sikey} $dbt { foreach {hash db_sikey} $hash_sikey { ## puts stderr "$hash (cached) (ino=[list $ino] sikey=[list $sikey])" } return $hash } } } else { # Iterate through possible names until we find a winner foreach other_name [lookup_iname $ino $txn] { ## puts -nonewline stderr " named $other_name..." set code [catch { # Check that file hasn't sneaked away from us check_inode $other_name $parent_dev $ino # Get the hash... set exec_rv [exec md5sum < $other_name] quiet puts stderr "$exec_rv $other_name (ino=[list $ino] sikey=[list $sikey])" regexp {^[a-fA-F0-9]{32}} $exec_rv hash } result] # If there's an error report it, otherwise record hash and stop if {$code} { puts stderr "md5sum: $other_name: $result ($::errorCode)\n$::errorInfo" return {} } else { # Record inode -> hash (clobber) $ihashes put $ino [list $hash $sikey]; # -txn $txn # Record hash -> inode (no clobber) $hashis put -nooverwrite [list $hash $sikey] $ino; # -txn $txn return $hash } } } } proc fill_db {} { global statis ihashes inames istats dbenv iqueue hashis global preserve_zeros global errorCode errorInfo set more_to_do 1 set consecutive_errors 0 while {$more_to_do && $consecutive_errors < 100} { set code [catch { set parent_name {} set txn {$dbenv txn} set iqc [$iqueue cursor] ; # -update, or -txn $txn] catch { set dbt_list [$iqc get -first] $iqc del } $iqc close # $txn commit set txn {} if {[llength $dbt_list]} { foreach dbt $dbt_list break foreach {key master_item} $dbt break foreach {parent_dev parent_ino} $master_item break set txn {$dbenv txn} set parent_name [lindex [lookup_iname $parent_ino $txn] 0] # $txn abort set txn {} quiet puts stderr "Entering $parent_name ($master_item)" careful_cd $parent_name $parent_dev $parent_ino set parent_id $parent_ino foreach child_name [glob -nocomplain * .*] { if {[regexp {^\.\.?$} $child_name]} { continue } # Create a new child record (or update an old one) set code [catch { set txn {} regsub {^[^/]} $child_name ./& escaped_child_name file lstat $escaped_child_name child_stats ## puts stderr "Stat $child_name:" # parray child_stats set child_id $child_stats(ino) set child_stati_key [unique_function child_stats] # Identify interesting children switch -exact $child_stats(type) { directory { # add {device inode} to queue # Actually we have to do this after we have entered the filename, sorry... set skip_child 0 set merge_child 0 } file { # Consider {name inode deferred-hash key} for later processing if {! ($child_stats(size) == 0 && $preserve_zeros) } { set skip_child 0 set merge_child 1 } else { set skip_child 1 } } default { # Not a directory or file, have nothing to do with it set skip_child 1 } } } result] if {$code} { puts stderr "$child_name: $::errorCode: $errorInfo" if {[string length $txn]} { puts -nonewline stderr "Aborting $txn: " set rv {$txn abort} puts stderr "$rv." } } # Is the child worth indexing? if {$skip_child} continue # ...then some DB operations set code [catch { ## puts -nonewline stderr "txn..." set txn {$dbenv txn} # Note ## puts -nonewline stderr "Checking $child_id in istats..." # If there is an istat, and it has not changed, abort transaction. set istat_changed 0 set istat_found 0 foreach dbt [$istats get $child_id] { # -txn $txn -rmw foreach {key value} $dbt { if {[string compare $value $child_stati_key]} { # Remove value corresponding to previous entry in statis... # ...only if the inode matches! foreach dbt [$statis get $value] { foreach {si_key si_value} $dbt { if {![string compare $si_value $child_id]} { puts -nonewline stderr "Invalidating stati $value..." puts stderr [$statis del $value]; # -txn $txn } } } # ...and hashis. foreach hash_dbt [$ihashes get $child_id] { foreach {hash_inode hash_stati_key} $hash_dbt { foreach dbt [$hashis get $hash_stati_key] { foreach {hi_key hi_value} $dbt { if {![string compare $hi_value $child_id]} { puts -nonewline stderr "Invalidating hashi $hi_value..." puts stderr [$hashis del $hash_stati_key]; # -txn $txn } } } } } # Change is good. You first. set istat_changed 1 } else { # Found istats, but it was not different. set istat_found 1 } } } if {$istat_changed || !$istat_found} { # Write new istat ## puts -nonewline stderr "Writing istats [list $child_id $child_stati_key]..." $istats put $child_id $child_stati_key; # -txn $txn # Remove the corresponding key in ihashes (if any) to force # a new hash to be generated later. ## puts -nonewline stderr "Invalidating ihash $child_id..." $ihashes del $child_id ; # -txn $txn } else { ## puts -nonewline stderr "Keeping cached istats [list $child_id $child_stati_key]..." } # Next: add the name for this child_id to the inames db # Read the inames entry for this inode. If there exists # an entry for this inode, we clobber that as we only need # one name for each inode, and we might as well keep the # most up-to-date one. set inames_record [list $child_name $parent_id] $inames put $child_id $inames_record; # -txn $txn # OK, now if the child was a directory, add it to queue if {![string compare $child_stats(type) directory]} { $iqueue put [expr rand()] [list $parent_dev $child_stats(ino)]; # -txn $txn } # $txn commit set txn {} ## puts stderr "done." } result] if {$code} { puts stderr "$child_name: $::errorCode: $errorInfo" if {[string length $txn]} { puts -nonewline stderr "Aborting $txn: " set rv {$txn abort} puts stderr "$rv." } } if {!$merge_child} continue # Merge the file we have just entered ## puts stderr "File: child_name child_id child_stati_key [list $child_name $child_id $child_stati_key]" set txn {} set outer_code [catch { set txn {$dbenv txn} # Check statis to see if we have seen this child_stati_key before set result [$statis put -nooverwrite $child_stati_key $child_id] ; # -txn $txn if {[string match DB_KEYEXIST:* $result]} { debug puts stderr "istat=[list $child_id $child_stati_key] exists (stati=[$statis get $child_stati_key]), populating hash table now" # Go through all inodes listed at child_stati_key. Make sure they have corresponding hashis. foreach dbt [$statis get $child_stati_key] { # -txn $txn -rmw # Part 1: generate hashes for "other" inode set non_us_seen 0 foreach {key other_child_id} $dbt { debug puts -nonewline stderr "Chasing inode $other_child_id..." if {$other_child_id == $child_id} { debug puts stderr "...is us!" continue } set non_us_seen 1 do_md5sum $parent_dev $other_child_id $child_stati_key $txn } if {!$non_us_seen} { continue } # Part 2: generate hashes for us debug puts -nonewline stderr "Chasing new inode $child_id..." set our_hash [do_md5sum $parent_dev $child_id $child_stati_key $txn] # Uhhh...oops? if {![string length $our_hash]} { continue } # Part 3: dupemerge debug puts -nonewline stderr "dupemerge [list $our_hash $child_stati_key]..." foreach dbt [$hashis get [list $our_hash $child_stati_key]] { foreach {key other_child_id} $dbt { debug puts -nonewline stderr "checking $dbt against [list $child_id]..." if {![string compare $other_child_id $child_id]} { debug puts stderr "Same inode, skipping" continue } debug puts stderr "Merging inode $other_child_id with $child_id." set inode1 $other_child_id set inode2 $child_id set names1 [lookup_iname $inode1 $txn] set names2 [lookup_iname $inode2 $txn] debug puts "Merging [list $names1] with [list $names2]" if {![llength $names1] || ![llength $names2]} { puts stderr "BUG: llength of [list $names1] or [list $names2] is 0!" continue } # Foreach name1 foreach name1 $names1 { ## puts stderr "Checking that [list $name1] exists" # Check that it exists if {[catch {check_inode $name1 $parent_dev $inode1}]} { puts stderr "[list $name1]: not found: $::errorInfo" continue } ## puts stderr "[list $name1] exists" # Foreach name2 foreach name2 $names2 { # Check that name2 exists ## puts stderr "Checking that [list $name2] exists" if {[catch {check_inode $name2 $parent_dev $inode2}]} { puts stderr "[list $name2]: not found: $::errorInfo" continue } ## puts stderr "[list $name2] exists" # Try to link set code [catch { quiet puts stderr [list cmp $name1 $name2] exec cmp $name1 $name2 # FIXME: What, no 'file link' in Tcl? Argh! quiet puts stderr [list ln -vf $name1 $name2] exec ln -vf $name1 $name2 2>@stderr >@stdout # inode2 is now gone. Maybe. Leave the entries specific to # inode2 in the cache in case we see inode2 again, but update # hashis/statis. # Update hashis and statis to refer to the inode we kept. # This time, we overwrite... $hashis put [list $our_hash $child_stati_key] $inode1 $statis put $child_stati_key $inode1 ### foreach dbt [$inames get $inode1] { ### foreach {inode name_parent} $dbt { ### $inames put $inode2 $name_parent ### } ### } } result] if {$code} { puts stderr "oops: $result ($::errorCode)\n$::errorInfo" } } } } } debug puts stderr "done." } } # $txn commit set txn {} } result] if {$outer_code} { puts stderr "[file join $parent_name $child_name]: $result ($::errorCode)\n$::errorInfo" if {[string length $txn]} { puts -nonewline stderr "Aborting $txn: " set rv {$txn abort} puts stderr "$rv." } } } } else { set more_to_do 0 } } result] # Break? if {$code} { puts stderr "$parent_name: $result ($::errorCode)\n$::errorInfo" incr consecutive_errors } else { set consecutive_errors 0 } } } puts stderr "Start building directory tree..." foreach dir $directories { puts stderr "...adding $dir" add_root_to_db $dir } if {!$queue_only} { fill_db } foreach handle $handles_to_close { puts -nonewline stderr "Closing $handle: " puts stderr "[$handle close]." } exit 0