#!/usr/bin/perl
# despite a nice short script here, we could run out of filehandles
# quite easily
# two ways - expire oldest filehandles when needed
#          - expire all filehandles every N lines (n ~ 1000)
#            (depending on how popular each currently open file is)

# (C)2000-3 by Ken Chasse
# author: math@sizone.org
#
# you are allowed to use this according to the GPL *AND* you must
# notify me of your use of it, and post changes on the web or back to me
# and notify me of such changes every time they're issued/changed.

# v0.010 28may00 - basic function
# v0.020 11jun00 - added -l and -n options
# v0.021 12jun00 - changed -l to -p (previous) in anticipation of prev/next
#                  and -n to -c
# v0.030 09feb03 - detects symlinks pointing to the same file and uses one
#                  filehandle only
#                - added -nd to indicate to NOT create dirs (backward compat)
#                - added -v for verbose output (reports # of lines processed)
# v.040  19feb03 - rewrote file creation routines & duplicate file detect
#                - added -D for copious DEBUG output
# v.042  16nov03 - added -m to not include matching lines in the output
#                  (useful for having delimeters between records not included
#                  in the output files)

# NOTE: this script deliberately runs without "use strict".  The filename
# macro given on the command line is eval'd as raw perl and is documented to
# see $a (last filename) and %f (open files); captures reach it through @b.
# All working state therefore stays in package globals so existing macros
# keep working.

use File::Basename;                 # basename() without shelling out
use FileHandle;     #::Multi;       # need file handle pooling
use Getopt::Long;                   # parse cmd line options

$version = "v0.042 16nov03";
$base    = basename($0);

# NOTE(review): the pattern in the "-p" DIV example below was lost in the
# copy this file was restored from and has been reconstructed; confirm it
# against an upstream copy if one is available.
if (@ARGV && ($ARGV[0] eq "-h" || $ARGV[0] eq "--help")) {
    die <<"EOF";
$base $version by Ken Chase math\@sizone.org

Usage: $base (opts) (pattern) (filename macro) (files)

will split a file line by line into new files, names based on the
filename macro (raw eval'd perl code) which can refer to the regexp
pattern the file was split on (ie use \$1 style backrefs in the
filename). Symlinks pointing at the same files are handled properly
(by checking device:inode pairs and using same filehandle for symlink
and natural file)

pattern must have a () glob in the regexp to give \$1 backreference
as a file to write each matching line to (or not, if you prefer, but
you wont get the dynamic behaviour this is designed to provide).

filename format is eval'd so can refer to \$1..\$9 in the filename.
backrefs (\$1..\$9) have / changed to : before eval of filename macro
to avoid '/etc/passwd' filename hacking. (Filename macro itself,
however, may contain /s supplied by you on command line.) use the
--IM_A_STUPID_MORON switch to turn this intelligent safety off.

options:
  -a         append files instead of overwriting them
  -m         dont include matching line in target file (ie elide
             multi-line line records' delimeter lines)
  -p         non matching lines go into file named by previous match
  -c [name]  non matching lines go into catchall file
             ("_unmatched_" unless specified)
  -nd        DO NOT create dirs as spec'd in filename macro
  -v         verbose output - show 1000s of lines input processed
  -e [name]  errors (output to non existen files/dirs) out to this
             file (default "_errors_" unless specified)
  -D         some debug output

using -p and -c together makes all unmatched lines before THE FIRST
MATCH go into the _unmatched_ file.

using -nd while trying to create dirs in filename macro (ie /'s in
filename) will result in no file being created (and no error
generated)

eg:
  $base ' (?:www\\.)?(.*)\$' '"weblogs/\$1"' access_log

will split the access_log file based on the last word of each line
(in this case the website accessed) and write the lines into a file
in the dir 'weblogs/' in file named by the domain (only, not
including the www. part).

  $base -p '<DIV ID="([^"]*)">' '"foo/\$1.html"'

will split an HTML file's DIVision tags into seperate files in the
foo/ dir, each named by the DIV ID html code. Text in between each
DIV ID will go into the file named by the last DIV ID matched.

Remember to escape your pattern \$'s and target filename code with 's
or \\s to protect \$ () ' and other characters.

Remember that the filename 'foo/bar/baz/\$1/\$2' will evaluate in
perl to an error or to null. foo/bar/baz is a mathematical statement
at best, not a filename. '"foo/bar/baz/\$1/\$2"' is, however, a valid
perl string and filename.

\$a contains the name of the last file if you want to refer to it in
your filename macro. keys(%f) is the list of existing filenames up to
this point.
EOF
}

##############################################################################

# Print debug output to STDERR, but only when -D was given.
sub debug {
    print STDERR @_ if $opt_D;
}

# Return "device:inode" for an open filehandle; this is the identity used to
# notice that two names (eg a symlink and its target) are the same file.
sub dev_inode {
    my ($fh) = @_;
    my @st = stat($fh);
    return $st[0] . ":" . $st[1];
}

$Getopt::Long::ignorecase = 0;      # we dont want to ignore option case

# Legacy calling style: each option lands in the package global $opt_<name>.
$result = GetOptions("a", "m", "p", "e:s", "c:s", "nd", "v", "D",
                     "IM_A_STUPID_MORON");
$result or die "$base: bad options (try $base --help)\n";

# "c:s"/"e:s" take an OPTIONAL name; when it is omitted Getopt::Long stores
# "", which is false.  Apply the documented defaults here so a bare -c / -e
# actually switches the feature on.
$errfile = defined($opt_e);                     # error capture requested?
$opt_c   = "_unmatched_" if defined($opt_c) && !$opt_c;
$opt_e   = "_errors_"    if $errfile        && !$opt_e;

debug("opt_v: '$opt_v'\n");
debug("opt_c: '$opt_c'\n");
debug("opt_m: '$opt_m'\n");
debug("opt_p: '$opt_p'\n");
debug("opt_e: '$opt_e'\n");
debug("errfile: $errfile\n");

$append{""}  = ">";                 # non append (overwrite)
$append{"1"} = ">>";                # append mode
$wmode = $append{ $opt_a ? "1" : "" };          # mode for real output opens

$pat  = shift @ARGV;                # pattern
$targ = shift @ARGV;                # target file macro
defined($pat) && defined($targ)
    or die "$base: missing pattern or filename macro (try $base --help)\n";
debug("pat:'$pat'\ntarg:'$targ'\n\n");

# Rewrite $1..$N in the macro to $b[1]..$b[N].  The captures are sanitised in
# @b before the macro runs, and input data is only ever read as data by the
# eval'd code, never spliced into it.
$targ =~ s/\$([1-9]\d*)/\$b[${1}]/g;

$mode        = 0777;                # default dir create mode, apply umask
$flush_every = 10000;               # buffered lines per file before a write
$seen_match  = 0;                   # has any line matched yet? (for -p with -c)

# State kept per output file:
#   %f   filename  -> filehandle      %di  filename  -> "dev:inode"
#   %fdi dev:inode -> filehandle      %fn  dev:inode -> filename
#   %l   dev:inode -> buffered text   %c   dev:inode -> buffered line count
while (<>) {                        # read named files, or stdin
    $l     = $_;                    # raw line (newline included) as written out
    $match = 0;
    chomp;                          # chomp, not chop: an unterminated last
                                    # line must not lose a real character
    $ln++;
    debug("=" x 78, "\nIN ", $ln, ": $_\n");

    if (@b = (/$pat/o)) {
        $match++;
        $seen_match = 1;
        unshift @b, 'error::you_shouldnt_see_this';     # sync $1 with $b[1]
        if (!$opt_IM_A_STUPID_MORON) {                  # repl / w. : in @b
            for my $cap (@b) {
                $cap =~ s#/#:#g if defined $cap;
            }
        }
        # The macro is perl code from the user's own command line; it is
        # eval'd by design.  Report a broken macro once instead of silently
        # routing everything to the fallback file.
        $a = eval($targ);
        if ($@ && !$macro_err_reported++) {
            warn "$base: filename macro failed: $@";
        }
    }
    debug("line matched: $match\t cur matches: |", join("|", @b),
          "|\ncur filename: |$a|\n");

    # Unmatched lines go to the catchall, except that with -p they follow the
    # previous match's file once there has been a match (as documented).
    if ($opt_c && !$match && !($opt_p && $seen_match)) {
        $a = $opt_c;
    }

    if ($opt_c || $opt_p || $match) {   # opt_p just goes to cur filehandle
        $a ||= "$0.nomatch.$$.$1";      # default filename if none

        if (!$f{$a}) {                  # if no file of this name yet
            # Make sure the directory part exists (unless -nd).
            @d = split("/", $a);
            pop @d;
            $dd = '';
            for my $part (@d) {
                $dd .= $part . "/";
                mkdir($dd, $mode) if !-d $dd && !$opt_nd;
            }

            debug("** fstat() '$a': ");
            # Probe in append mode: creates the file without truncating it,
            # just to learn its dev:inode.
            if ($f{$a} = FileHandle->new($a, ">>")) {
                $di = dev_inode($f{$a});
                $f{$a}->close;
            }
            else {                      # cant get new filehandle!!
                delete $f{$a};          # keep %f to names that really exist
                if (!$errfile) {
                    debug(" (no file, no error catch file)\n");
                    next;               # no file to output to, skip
                }
                debug(" --> errorfile $opt_e");
                $a = $opt_e;            # log the line to the error file
                if (!$f{$a}) {          # if no errorfile FH already
                    # Probe with ">>" as well, so an error file that is
                    # already in use under another name is not truncated.
                    if ($f{$a} = FileHandle->new($a, ">>")) {
                        $di = dev_inode($f{$a});
                        $f{$a}->close;
                        $di{$a} = $di;
                    }
                    else {
                        delete $f{$a};
                        warn "$base: cannot open error file '$a': $!\n"
                            unless $open_err_reported{$a}++;
                        next;
                    }
                }
                $di = $di{$a};          # get dev:inode of error file
            }
            debug(" $di\n");

            if ($fdi{$di}) {            # used this dev/inode?
                $f{$a} = $fdi{$di};     # reuse filehandle if same
                debug(" same file as ", $fn{$di}, "\n");
            }
            else {                      # never seen it, so new FH
                if ($f{$a} = FileHandle->new($a, $wmode)) {
                    $di = dev_inode($f{$a});
                }
                else {
                    delete $f{$a};
                    warn "$base: cannot open '$a' for writing: $!\n"
                        unless $open_err_reported{$a}++;
                    next;
                }
            }
            $fdi{$di} = $f{$a};         # FH by dev/inode
            $di{$a}   = $di;            # dev/inode by filename
            $fn{$di}  = $a;             # filename by dev/inode
        }

        $di = $di{$a};                  # short form is handy
        if (!($match && $opt_m)) {      # -m drops the matching line itself
            $l{$di} .= $l;
            $c{$di}++;
            debug(" added to '$a'\n");
        }
        if ($c{$di} > $flush_every) {   # >10000 lines, flush buffers
            ($opt_D || $opt_v) && (print STDERR " 10000 lines >> '$a'\n");
            $f{$a}->print($l{$di})
                or warn "$base: write to '$a' failed: $!\n";
            $c{$di} = 0;
            $l{$di} = "";
        }
    }

    $bytes += length($_);               # track bytes read
    if (!($ln % 10000) && ($opt_v || $opt_D)) {     # show progress
        print STDERR "Lines: $ln Bytes: $bytes\n";
    }
}

# dump leftover buffer lines
for my $key (keys %l) {                 # go thru all buffered dev:inodes
    if ($opt_D || $opt_v) {
        print STDERR "emptying buffer for $fn{$key}",
            " (dev:inode $key / ", length($l{$key}), " bytes)\n";
    }
    # Test the length, not the truth value: a buffer holding just "0" is
    # false in perl but still has to be written.
    next unless $fdi{$key} && defined($l{$key}) && length($l{$key});
    $fdi{$key}->print($l{$key})
        or warn "$base: write to '$fn{$key}' failed: $!\n";
}

# Close every distinct handle explicitly; buffered write errors only show up
# at close time.
for my $key (keys %fdi) {
    next unless $fdi{$key};
    $fdi{$key}->close
        or warn "$base: error closing '$fn{$key}': $!\n";
}