Please see this


- HP-UX Index
- Solaris
- Linux Index
- Other Unix types
- General Unix
- Unix Networking
- Unix Scripts
- Unix databases

Associated Information

- Unix Scripts Tips

Useful Links

- Bash shell HOWTO
- Bourne Shell Man Page
- C Shell Guide
- Korn Shell Resources


#!/usr/bin/perl
#############################################################################
#
# File: finddups
#
# $Id: finddups,v 1.2 1999/09/19 12:52:05 root Exp root $
#
# Usage: finddups [-rdih?] [base directory]
#
# Description:
#   Used to find possible duplicate files within the provided search
#   tree starting from the base directory, regardless of renames or
#   locations.
#
#   The original version included a crude CRC32 checksum routine that
#   works but is very slow in performance.  It is left in only for fun
#   and for DOS users that may not have any UNIX ports of sum, cksum,
#   or the MD5 hash function.
#
#   Please use the "diff" command to verify files are absolutely
#   identical before any removal operations are taken.
#
# Options:
#   -r  Recursively check all sub-directories from the base dir that
#       you passed on the command line.
#   -i  Use the internal CRC32 function.  Only use this if nothing
#       better has been installed on your system.  The external
#       program named by $CRCFunc below is used if no '-i' flag is
#       passed.
#   -d  Debug mode of operation, prints files CRC/hash to stdout.
#   -h, -?
#       Print a short usage summary and exit.
#   --  End of options; the next argument is taken as the base
#       directory even if it starts with '-'.
#
# WARNING! Always verify any file that is reported as a duplicate with
# another tool such as 'diff' before you delete it.  Equal checksums only
# make two files *possible* duplicates.
#
# Author: Michael A. Gumienny
#
# Written: 1995, Re-written 1999
#
#############################################################################

use strict;
use warnings;

#############################################################################
#
# User modifiable variable definitions:
#
# You will more than likely use one of the following native functions of
# the UNIX or ported Windows functions now.  These and similar CRC/hash
# functions typically return a line similar to the following:
#
#   (file checksum/hash value) (file size) (file name)
#
# This script has been hard coded to ignore anything other than the first
# field of returned data which is the actual CRC/hash value that we need.
# If you need to modify the script to accept a different function's return
# value, you can do so in the "POSSIBLE MODIFICATIONS SECTION" of the
# external_checksum routine below.
#
#############################################################################

# Uncomment the function you wish to use.
#my $CRCFunc = "/usr/bin/sum";
#my $CRCFunc = "/usr/bin/cksum";
my $CRCFunc = "/usr/bin/md5sum";

#############################################################################
#
# Non User modifiable variable definitions:
#
#############################################################################

# These won't be used unless you choose to use the internal CRC-32 function.
# Several polynomial variants seem to exist, but this is the most common one
# that I found in use...
my $Polynomial = 0xedb88320;    # [PkZip, Autodin II, Ethernet, FDDI]
my @crc_table;                  # filled in by GeneratePolyTable

# Command line switches (set by the main routine).
my $recurse  = 0;               # -r
my $debug    = 0;               # -d
my $internal = 0;               # -i

# Checksum/hash value => reference to the list of files that produced it.
# A hash is used instead of "crc<TAB>name" strings so that file names
# containing tabs are not truncated when the records are taken apart again.
my %files_by_crc;

#############################################################################
# Help();
#
# This routine explains brief usage syntax to STDOUT.  The program is then
# terminated.
#############################################################################
sub Help {
    # print (not printf) is used throughout: none of these strings is a
    # format, and printf would misinterpret any '%' that ends up in one.
    print "Usage:\tfinddups [-rid] [directory]\n";
    print "\tUsed to find duplicate files irregardless of filename or location.\n\n";
    print "\tOptions:\n";
    print "\t-r\tRecursively check all sub-directories also.\n";
    print "\t-i\tUse internal CRC32 function only if nothing better installed.\n";
    print "\t-d\tdebug mode, print file CRC/hash values to stdout.\n";
    print "\t-h, -?\tPrint this help text.\n";
    exit;
}

#############################################################################
# $sum = external_checksum($filename);
#
# Runs the program named by $CRCFunc on $filename and returns the first
# whitespace separated field of the first line it prints.  Returns nothing
# (undef) when the program produced no output for the file, e.g. because
# the file could not be read.  Dies when the program itself cannot be run.
#############################################################################
sub external_checksum {
    my ($filename) = @_;

    # Keep a name that starts with '-' from being taken as an option.
    my $target = ($filename =~ /^-/) ? "./$filename" : $filename;

    # List form of the pipe open: no shell is involved, so quotes, spaces
    # and other shell meta characters in file names are passed untouched.
    open(my $pipe, '-|', $CRCFunc, $target)
        or die "Unable to run $CRCFunc for CRC generation: $!\n";
    my $output = <$pipe>;
    close($pipe);    # a failing exit status simply means "no checksum"

    return unless defined $output;
    $output =~ s/\n$//;

    # NOTE: This next line is where you may have to do some changes for
    # future compatibility, system differences, etc.  Most sum/cksum and
    # md5sum functions return the crc/hash value as the first field of
    # their output.  If the crc/hash function that you choose to use is
    # different, then change the following line to suit your needs.
    # Some example methods might be
    #   ($filecrc, $junk)        = split(" ", $output);
    #   ($junk, $filecrc, $junk) = split(" ", $output);
    my ($filecrc) = split(" ", $output);
    # END OF POSSIBLE MODIFICATIONS SECTION

    return unless defined $filecrc && length $filecrc;

    # GNU md5sum starts the line with a backslash when it had to escape
    # characters in the file name; that marker is not part of the hash.
    $filecrc =~ s/^\\//;

    return $filecrc;
}

#############################################################################
# getdir(root, recursiveFlag);
#
# Routine to gather the checksums of the files within a directory, with an
# optional recursive flag.  Results are collected in %files_by_crc.
# Dies if the directory cannot be opened.
#############################################################################
sub getdir {
    my ($rootdir, $r) = @_;

    # A lexical handle keeps recursive calls from sharing (and closing)
    # each other's directory handle.
    opendir(my $dh, $rootdir)
        or die "Unable to read directory $rootdir: $!\n";
    my @entries = sort readdir($dh);
    closedir($dh);

    foreach my $entry (@entries) {
        next if $entry eq '.' || $entry eq '..';

        my $filename = "$rootdir/$entry";
        # root directory gets a double slash prepended so we clean it up.
        $filename =~ s{//}{/};

        if (-d $filename) {
            # Symbolic links to directories are never followed.
            getdir($filename, 1) if $r && !-l $filename;
            next;
        }

        # Only plain files have contents worth comparing; reading a FIFO
        # or a device could block forever, dangling links cannot be read.
        next unless -f $filename;

        # The internal CRC32 can and will report identical checksums for
        # some different files; an external hash such as MD5 is the
        # better choice whenever it is available.
        my $filecrc = $internal ? GenCRC($filename)
                                : external_checksum($filename);

        # Files without a checksum are skipped; otherwise every unreadable
        # file would be reported as a duplicate of every other one.
        unless (defined $filecrc && length $filecrc) {
            warn "No CRC/hash value for $filename, skipped.\n";
            next;
        }

        push @{ $files_by_crc{$filecrc} }, $filename;
        print "File: $filename $filecrc\n" if $debug;
    }
    return;
}

#############################################################################
# GeneratePolyTable();
#
# This routine generates the CRC32 polynomial table when called.
#############################################################################
sub GeneratePolyTable {
    # 0x7fffffff compensates for signed integers within PERL
    for my $i (0 .. 255) {
        my $crc_accum = $i;
        for (1 .. 8) {
            if ($crc_accum & 0x00000001) {
                $crc_accum = (($crc_accum >> 1) & 0x7fffffff) ^ $Polynomial;
            }
            else {
                $crc_accum = ($crc_accum >> 1) & 0x7fffffff;
            }
        }
        $crc_table[$i] = $crc_accum;
    }
    return;
}

#############################################################################
# $x = GenCRC($filename);
#
# Routine generates a CRC32 sum for the given $filename.  Returns nothing
# (undef) after printing a warning when the file cannot be read.
#############################################################################
sub GenCRC {
    my ($filename) = @_;
    my $crc = 0xffffffff;    # initial CRC value

    GeneratePolyTable() unless @crc_table;

    my $fh;
    unless (open($fh, '<', $filename)) {
        warn "Unable to read $filename for CRC generation: $!\n";
        return;
    }
    binmode($fh);            # Added for DOS users...

    # Work through the file in chunks so that large files do not have to
    # fit into memory in one piece.
    my $buffer;
    while (1) {
        my $got = read($fh, $buffer, 65536);
        unless (defined $got) {
            warn "Can't read all of $filename: $!\n";
            close($fh);
            return;
        }
        last if $got == 0;

        foreach my $byte (unpack('C*', $buffer)) {
            my $index = ($crc ^ $byte) & 0xff;
            $crc = (($crc >> 8) & 0x00ffffff) ^ $crc_table[$index];
        }
    }
    close($fh);

    return ($crc ^ 0xffffffff) & 0xffffffff;    # XOR the return value
}

#############################################################################
# Main routine begins.
#############################################################################

Help() unless @ARGV;         # Help the user with the syntax

my $dir;
my $options_done = 0;
foreach my $arg (@ARGV) {
    if (!$options_done && $arg eq '--') {
        $options_done = 1;
        next;
    }
    if (!$options_done && $arg =~ /^-/) {
        # A plain 'h' is matched here; '\h' would mean "horizontal
        # whitespace" to the regex engine.
        Help() if $arg =~ /[h?]/;
        $recurse  = 1 if $arg =~ /r/;
        $debug    = 1 if $arg =~ /d/;
        $internal = 1 if $arg =~ /i/;
        next;
    }
    # The first non-option argument is the base directory, wherever it
    # appears on the command line.
    if (defined $dir) {
        warn "Ignoring extra argument: $arg\n";
        next;
    }
    $dir = $arg;
}

# If no start directory was given, use the current as our base
$dir = "." unless defined $dir && length $dir;

GeneratePolyTable() if $internal;

getdir($dir, $recurse);

# Report every checksum that was produced by more than one file.
foreach my $crc (sort keys %files_by_crc) {
    my @files = sort @{ $files_by_crc{$crc} };
    next if @files < 2;

    print "--- Possible Duplicates ---\n";
    print " $_\n" foreach @files;
    print "===========================\n";
}

#### END OF SCRIPT

Copyright 2000 Intronet Computers Ltd
Email: Intronet Computers for enquiries