Contents
- INDEX
- HP-UX Index
- Solaris
- Linux Index
- Other Unix types
- General Unix
- Unix Networking
- Unix Scripts
- Unix databases
Associated Information
- Unix Scripts Tips
Useful Links
-
Bash shell HOWTO
- Bourne Shell Man Page
-
C Shell Guide
- Korn Shell Resources
|
example
#!/usr/bin/perl
#############################################################################
# #
#############################################################################
# #
# File: finddups #
# #
# $Id: finddups,v 1.2 1999/09/19 12:52:05 root Exp root $ #
# #
# Usage: finddups [-rdih?] [base directory] #
# #
# Description: #
# Used to find possible duplicate files within the provided search #
# tree starting from the base directory, regardless of renames or #
# locations. #
# #
# The original version included a crude CRC32 checksum routine that #
# works but is very slow in performance. I left it in only for fun #
# and for DOS users that may not have any UNIX ports of sum, cksum, #
# or the latest theoretically unrepeatable MD5 hash function. Guess #
# a script like this will tell, now won't it! #
# #
# Please use the "diff" command to verify files are absolutely #
# identical before any removal operations are taken. #
# #
# Options: #
# -r Recursively check all sub-directories from the base dir that #
# you passed on the command line. #
# -i Use the internal CRC32 function. Only use this if nothing #
# better has been installed on your system. I would suggest #
# using the MD5 hash function. This is set below by the #
# variable $CRCFunc and will be used if no '-i' flag is passed.#
# -d Debug mode of operation, prints files CRC/hash to stdout. #
# #
# WARNING! Always verify any file that is reported as a duplicate with #
# another tool such as 'diff' before you delete it. MD5 is reported to #
# be infinitely impossible to repeat but so was.... You get the idea! #
# #
# Author: Michael A. Gumienny #
# #
# Written: 1995, Re-written 1999 #
# #
#############################################################################
#############################################################################
# #
# User modifiable variable definitions: #
# #
# NOTE: NOTE: NOTE: NOTE: NOTE: NOTE: NOTE: NOTE: NOTE: #
# #
# You will more than likely use one of the following native functions of #
# the UNIX or ported Windows functions now. These and similar CRC/hash #
# functions typically return a line similar to the following: #
# #
# (file checksum/hash value) (file size) (file name) #
# #
# This script has been hard coded to ignore anything other than the first #
# field of returned data which is the actual CRC/hash value that we need. #
# If you need to modify the script to accept a different function's return #
# value, you can do so in the getdir routine, just above the comment that #
# reads "END OF POSSIBLE MODIFICATIONS SECTION". #
#############################################################################
#
# Uncomment the function you wish to use.
# $CRCFunc names the external command that getdir runs on every file when
# the '-i' flag was not given; only the first whitespace-separated field of
# the command's output is kept as that file's checksum/hash.
#$CRCFunc = "/usr/bin/sum";
#$CRCFunc = "/usr/bin/cksum";
$CRCFunc = "/usr/bin/md5sum";
#############################################################################
# #
# Non User modifiable variable definitions: #
# #
#############################################################################
# These won't be used unless you choose to use the internal CRC-32 function.
# Several polynomial variants seem to exist, but this is the most common one
# that I found in use...
# GeneratePolyTable XORs this value into a table entry every time a set low
# bit is shifted out of that entry.
$Polynomial =0xedb88320; # [PkZip, Autodin II, Ethernet, FDDI]
#############################################################################
# &Help; #
# This routine explains brief usage syntax to STDOUT. The program is then #
# terminated. #
#############################################################################
#
sub Help
{
    # Print the usage summary to STDOUT and terminate the program.
    # Takes no arguments and never returns to the caller.
    #
    # The synopsis lists every flag the option parser acts on and marks the
    # directory as optional, because the script falls back to "." when only
    # flags are supplied.  print is used instead of printf so the text can
    # never be misread as a format string.
    print "Usage:\tfinddups [-rdi?] [directory]\n",
          "\tUsed to find duplicate files irregardless of filename or location.\n\n",
          "\tOptions:\n",
          "\t-r\tRecursively check all sub-directories also.\n",
          "\t-i\tUse internal CRC32 function only if nothing better installed.\n",
          "\t-d\tdebug mode, print file CRC/hash values to stdout.\n",
          "\t-?\tShow this usage summary and exit.\n";
    exit;
}
#############################################################################
# getdir (root, recursiveFlag); #
# Routine to gather filenames within a directory, with an optional recursive#
# flag. #
#############################################################################
#
sub getdir
{
    # getdir(root, recursiveFlag)
    #
    # Visits the entries of $rootdir in sorted order.  For every regular
    # file (or symlink to one) a checksum/hash is obtained and the pair
    # "checksum<TAB>path" is appended to the global @Array.  When $r is true,
    # real sub-directories (not symlinked ones) are processed recursively.
    #
    # Globals read:    $internal, $debug, $CRCFunc
    # Globals written: @Array
    # Dies when $rootdir cannot be opened; prints a message and exits when
    # the external checksum command cannot be started.
    my ($rootdir, $r) = @_;

    # A lexical handle keeps every recursion level separate.  The entry list
    # is read in full and the handle released with closedir before any
    # hashing or recursion takes place.
    opendir(my $dh, $rootdir) || die "Unable to open directory $rootdir: $!\n";
    my @entries = sort readdir($dh);
    closedir($dh);

    foreach my $entry (@entries)
    {
        next if ($entry =~ /^\.\.?$/);
        my $filename = "$rootdir/$entry";
        # root directory gets a double slash prepended so we clean it up.
        $filename =~ s/\/\//\//;

        if (-d $filename)
        {
            # Symlinked directories are never followed.
            getdir($filename, 1) if ($r && !-l $filename);
            next;
        }

        # FIFOs, sockets, device nodes and dangling symlinks cannot be
        # hashed meaningfully (and reading a FIFO may block forever).
        next unless (-f $filename);

        my $filecrc;
        if ($internal)
        {
            # User blindly trust the internal CRC32 Function...
            # WARNING! Some files can and will report identical
            # CRC32 checksums. a better method is to use the
            # newer MD5 hash function. In theory no two files
            # will return an identical MD5 sum. You have been warned!
            $filecrc = GenCRC($filename);
        }
        else
        {
            # User wants to use the systems better CRC/hash functions.
            # The command is started without a shell (list-form pipe open),
            # so quotes or other shell metacharacters in a file name are
            # passed through untouched.  $CRCFunc is split on whitespace so
            # that a setting such as "/usr/bin/md5sum -b" keeps working.
            my @cmd = (split(' ', $CRCFunc), $filename);
            my $pipe;
            if ( !open($pipe, '-|', @cmd) )
            {
                print "Unable to read $CRCFunc or $filename for CRC generation.\n";
                exit;
            }
            # Get the first line returned by the CRC/hash function.
            $filecrc = <$pipe>;
            close($pipe);
            # Strip the trailing newline (undef when nothing was printed).
            $filecrc =~ s/\n$// if defined($filecrc);
        }
        # NOTE: This next line is where you may have to do some
        # changes for future compatibility, system differences, etc.
        # Most sum/cksum and now md5sum functions return the crc/hash
        # value as the first field of its return value. If your
        # crc/hash function that you choose to use is different, then
        # change the following line to suit your needs accordingly...
        # Some example methods might be
        # ($filecrc, $junk) = split(" ", $filecrc);
        # ($junk, $filecrc, $junk) = split(" ", $filecrc);
        ($filecrc) = split(" ", $filecrc) if defined($filecrc);
        # END OF POSSIBLE MODIFICATIONS SECTION
        # The rest of the script should continue to function properly

        # A file without a checksum (for example an unreadable one) must not
        # be recorded: every such file would share the same empty key and
        # be reported as a duplicate of the others.
        if (!defined($filecrc))
        {
            warn "No checksum obtained for $filename, skipping.\n";
            next;
        }

        push(@Array, join("\t", $filecrc, $filename));
        if ($debug) { print "File: $filename $filecrc\n"; }
    }
}
#############################################################################
# This routine generates the CRC32 polynomial table when called. #
#############################################################################
#
sub GeneratePolyTable
{
    # Fill the global @crc_table with the 256 entries used by GenCRC.
    # Takes no arguments; reads the global $Polynomial.
    #
    # Each entry starts as its own index and is shifted right eight times;
    # whenever the bit shifted out was set, $Polynomial is XORed in.
    # The 0x7fffffff mask compensates for signed integers within PERL.
    # All working variables are lexical so nothing leaks into the package,
    # and the result is stored through a scalar element, not a slice.
    foreach my $byte (0 .. 255)
    {
        my $entry = $byte;
        foreach my $round (1 .. 8)
        {
            my $low_bit = $entry & 0x00000001;
            $entry = ($entry >> 1) & 0x7fffffff;
            $entry ^= $Polynomial if $low_bit;
        }
        $crc_table[$byte] = $entry;
    }
}
#############################################################################
# $x=&GenCRC($filename); #
# Routine generates a CRC32 sum for the given $filename. #
#############################################################################
#
sub GenCRC
{
    # $x = GenCRC($filename);
    #
    # Returns the CRC32 of the contents of $filename, computed with the
    # lookup table in the global @crc_table (filled by GeneratePolyTable).
    # Prints a message and exits when the file cannot be opened or read;
    # warns when fewer bytes were read than stat reported.
    my ($filename) = @_;
    my $crc = 0xffffffff; # initial CRC value

    # Three-argument open: the name is used literally, so leading/trailing
    # blanks or characters such as '>' and '|' in it cannot change what is
    # opened.  print (not printf) keeps '%' in a file name from being
    # interpreted as a format directive.
    my $fh;
    if ( !open($fh, '<', $filename) )
    {
        print "Unable to read $filename for CRC generation.\n";
        exit;
    }
    binmode($fh); # Added for DOS users...

    my $rpt_size = (stat($fh))[7];
    my $buffer   = '';
    my $act_size = read($fh, $buffer, $rpt_size);
    close($fh); # the original never released this handle
    if ( !defined($act_size) )
    {
        print "Can't read all of $filename.\n";
        exit;
    }
    if ( $rpt_size != $act_size )
    { warn "Bytes read does != 'stat' size\n"; }

    my $len = length($buffer); # length of buffer whose crc will be checked
    for (my $pos = 0; $pos < $len; $pos++)
    {
        my $index = ( $crc ^ ord(substr($buffer, $pos, 1)) ) & 0xff;
        $crc = ( ( $crc >> 8 ) & 0x00ffffff ) ^ $crc_table[$index];
    }
    return ($crc ^ 0xffffffff); # XOR the return value
}
#############################################################################
# Main routine begins. #
#############################################################################
#
# Command line handling.
#
# Every argument starting with '-' is a bundle of option letters; the first
# argument that does not start with '-' is the base directory.  Flags and
# the directory may therefore appear in any order and the flags may be
# given separately ("-r -d dir") or bundled ("-rd dir").
#
# Globals set here and read by getdir: $internal, $debug.
if (!@ARGV) { Help(); } # Help the user with the syntax

undef($dir);
foreach my $arg (@ARGV)
{
    if ($arg =~ /^-/)
    {
        # A literal 'h' is matched with a character class: on Perl 5.10 and
        # later the escape \h means "horizontal whitespace", not the letter.
        if ($arg =~ /[h?]/) { Help(); }
        if ($arg =~ /r/) { $recurse = 1; }
        if ($arg =~ /d/) { $debug = 1; }
        # Build the CRC table only once, even if -i is repeated.
        if ($arg =~ /i/ && !$internal) { $internal = 1; GeneratePolyTable(); }
    }
    elsif (!defined($dir))
    {
        $dir = $arg;
    }
}

# If no start directory was given, use the current as our base.
# (Tested with defined/length so that a directory named "0" is accepted.)
$dir = "." unless (defined($dir) && length($dir));
getdir($dir, $recurse);
# Report phase.
#
# @Array holds "checksum<TAB>path" strings; once sorted, files with equal
# checksums are adjacent.  Each run of two or more equal checksums is
# printed as one group that is closed by a separator line.
#
# print is used instead of printf so a '%' in a file name is shown as-is,
# and split is limited to two fields so a tab inside a file name does not
# truncate it.
my ($last_crc, $last_file);
my $in_group = 0;
foreach my $line (sort(@Array))
{
    my ($current_crc, $current_file) = split(/\t/, $line, 2);
    if (defined($last_crc) && $current_crc eq $last_crc)
    {
        if (!$in_group)
        {
            # Second member of a new group: emit the header and the first
            # member, which was seen on the previous iteration.
            $in_group = 1;
            print "--- Possible Duplicates ---\n";
            print " $last_file\n";
        }
        print " $current_file\n";
    }
    elsif ($in_group)
    {
        print "===========================\n";
        $in_group = 0;
    }
    $last_file = $current_file;
    $last_crc  = $current_crc;
}
# Close a group that runs up to the very end of the list.
print "===========================\n" if $in_group;
#### END OF SCRIPT
|