From 5722e4fa7fe4c8f2378c0777e49f753a15879644 Mon Sep 17 00:00:00 2001 From: gdr Date: Sun, 28 Jan 1996 00:52:48 +0000 Subject: [PATCH] version 1.0 of msort and dsort for GNO --- usr.bin/sort/README | 18 +++ usr.bin/sort/common.h | 57 ++++++++ usr.bin/sort/disksort.c | 262 ++++++++++++++++++++++++++++++++++++ usr.bin/sort/dsort.1 | 130 ++++++++++++++++++ usr.bin/sort/dsort.c | 207 ++++++++++++++++++++++++++++ usr.bin/sort/initdisksort.c | 105 +++++++++++++++ usr.bin/sort/linecount.c | 89 ++++++++++++ usr.bin/sort/loadarray.c | 121 +++++++++++++++++ usr.bin/sort/makefile.mk | 55 ++++++++ usr.bin/sort/mergeone.c | 169 +++++++++++++++++++++++ usr.bin/sort/msort.1 | 1 + usr.bin/sort/msort.c | 155 +++++++++++++++++++++ usr.bin/sort/sortarray.c | 62 +++++++++ usr.bin/sort/tempnam.c | 130 ++++++++++++++++++ 14 files changed, 1561 insertions(+) create mode 100644 usr.bin/sort/README create mode 100644 usr.bin/sort/common.h create mode 100644 usr.bin/sort/disksort.c create mode 100644 usr.bin/sort/dsort.1 create mode 100644 usr.bin/sort/dsort.c create mode 100644 usr.bin/sort/initdisksort.c create mode 100644 usr.bin/sort/linecount.c create mode 100644 usr.bin/sort/loadarray.c create mode 100644 usr.bin/sort/makefile.mk create mode 100644 usr.bin/sort/mergeone.c create mode 100644 usr.bin/sort/msort.1 create mode 100644 usr.bin/sort/msort.c create mode 100644 usr.bin/sort/sortarray.c create mode 100644 usr.bin/sort/tempnam.c diff --git a/usr.bin/sort/README b/usr.bin/sort/README new file mode 100644 index 0000000..445160a --- /dev/null +++ b/usr.bin/sort/README @@ -0,0 +1,18 @@ +This archive contains the utilities msort(1) and dsort(1). Both sort +text files lexicographically. + +Msort is a fast in-place memory sort. + +Dsort is a disk based sort that can handle "arbitrarily large" files +(in reality, limited to ULONG_MAX -- 4 294 967 295 -- bytes). + +The big difference between these sorts and the previously available +sort(1) is that these won't crash your system ... 
if either run into +problems, they exit gracefully and (if you are using the verbose flag) +tell you what the problem is. + +Enjoy. + +Devin Reade +14 June 1994 + diff --git a/usr.bin/sort/common.h b/usr.bin/sort/common.h new file mode 100644 index 0000000..c38c978 --- /dev/null +++ b/usr.bin/sort/common.h @@ -0,0 +1,57 @@ +#include +#include + +#ifdef DEFFUNC +# define EXTERN +#else +# define EXTERN extern +#endif + +#define ALN2I 1.442695022 /* 1 / ln(2) */ +#define TINY 1.0e-5 /* "zero" for heapsort */ +#define BUFFERSIZE 4096 /* a generic buffer for I/O */ +#define DEFAULT_LINECOUNT 1000 /* number of lines to memory sort */ +#define DEFAULT_LINELENGTH 512 /* max length of line recognised */ +#define DELIM 0x03 /* ETX */ + +#ifdef __ORCAC__ +# define NEWLINE '\r' +#else +# define NEWLINE '\n' +# define BROKEN_REALLOC +#endif + +#ifdef __GNUC__ + int printf(char *format, ...); + int fprintf(FILE *stream, char *format, ...); + void perror(char *s); + int close(int fd); + int fclose(FILE *stream); + int rename(char *, char *); + void rewind(FILE *); +#endif + +#ifdef DEBUG +# define STATUS(string) fprintf(stderr,"%s\n",string) + extern void begin_stack_check(void); + extern int end_stack_check(void); +#else +# define STATUS(string) {;} +#endif + +unsigned long int linecount (char *filename, size_t *maxlinelen); +char **loadarray (unsigned long n, char *filename, size_t maxlinelen); +void sortarray(char *array[], unsigned long n); +int disksort (char *filename, size_t linecount, size_t linelength); +int initdisksort(void); +int mergeone(FILE *fpA, FILE *fpB, FILE *fpC, char strA[], char strB[], + size_t linelength); + +EXTERN short v_flag; +EXTERN FILE *out_fp; + +#ifdef DSORT +EXTERN FILE *fp1, *fp2, *fp3, *fp4; +EXTERN char *file1, *file2, *file3, *file4; +EXTERN char *tpath1, *tpath2, *tpath3, *tpath4; +#endif diff --git a/usr.bin/sort/disksort.c b/usr.bin/sort/disksort.c new file mode 100644 index 0000000..4a502b4 --- /dev/null +++ b/usr.bin/sort/disksort.c @@ 
-0,0 +1,262 @@ +#ifdef __CCFRONT__ +#include <14:pragma.h> +#endif + +#define DSORT +#include "common.h" + +#include +#include + +#define KILLTEMP fclose(fp1); fclose(fp2); fclose(fp3); fclose(fp4); \ + unlink(file1); unlink(file2); unlink(file3); unlink(file4); \ + free(file1); free(file2); free(file3); free(file4) +#define KILLARRAY(a,i) { \ + size_t j; \ + for (j=0;j is the name of the input file. is the + * maximum number of text lines we should try to sort in _memory_ + * at any one time. If it is zero, then DEFAULT_LINECOUNT is used. + * may be much smaller than the actual linecount of + * . - 1 is the maximum length of line from + * that disksort will recognise. If is zero, + * then DEFAULT_LINELENGTH is used. Global out_fp is an open stream. + * + * Post: The file refered to by is sorted lexicographically + * by lines. If a line is longer than , then any extra + * characters in that line will be truncated. On success, disksort + * returns zero. On failure, disksort returns -1. The sorted output + * is printed to out_fp, and is unchanged. + * + * Note: This routine is based on a polyphase merge sort using four + * temporary files. + * + * Uses Globals: + * fp1, fp2, fp3, fp4, file1, file2, file3, file4, out_fp + */ + +int disksort (char *infile, size_t linecount, size_t linelength) { + FILE *in_fp; /* input file pointer */ + char lastfile; /* to where did we last write? */ + size_t runcount; /* how many runs make up infile? 
*/ + FILE *fpA, *fpB, *fpC, *fpD; + char *tempout; /* the name of the final temp file */ + char **array; + size_t i; + char *strA, *strB; + + /* + * + * PHASE ZERO: Initialization + * + */ + + if (initdisksort() != 0) return -1; /* already printed error msgs */ + + /* + * Open the input file for reading + */ + + if ((in_fp=fopen(infile,"r"))==NULL) { + if (v_flag) perror("disksort: couldn't open input file"); + KILLTEMP; + return -1; + } + + /* + * Get the size of the array of strings we will sort, and create it + */ + + if (linecount == 0) linecount = DEFAULT_LINECOUNT; + if (linelength == 0) linelength = DEFAULT_LINELENGTH; + if ((array = malloc (linecount * sizeof(char *))) == NULL) { + if (v_flag) perror("disksort: couldn't allocate array"); + fclose(in_fp); + KILLTEMP; + return -1; + } + for (i = 0; i records. Keep merging and + */ + + /* initialize this backwards because of the initial flip */ + fpA = fp2; + + /* get some scratch strings for the merge */ + if (((strA=malloc(linelength))==NULL) || + ((strB=malloc(linelength))==NULL)) { + if (v_flag) perror(errmsg3); + return -1; + } + + do { + runcount = 0; + + /* flip the files so we can sort back the other way */ + if (fpA == fp1) { + fpA = fp3; + fpB = fp4; + fp1 = freopen(file1,"w+",fp1); + fp2 = freopen(file2,"w+",fp2); + if ((fp1==NULL) || (fp2==NULL)) { + if (v_flag) perror(errmsg4); + return -1; + } + fpC = fp1; + fpD = fp2; + } else { + fpA = fp1; + fpB = fp2; + fp3 = freopen(file3,"w+",fp3); + fp4 = freopen(file4,"w+",fp4); + if ((fp3==NULL) || (fp4==NULL)) { + if (v_flag) perror(errmsg4); + return -1; + } + fpC = fp3; + fpD = fp4; + } + rewind(fpA); + rewind(fpB); + + /* + * Sort pairs of runs until EOF is reached + */ + for (;;) { + int mergeresult; + + /* + * merge one run from each of fpA and fpB into fpC, then repeat + * it but placing the result into fpD. 
+ */ + + mergeresult = mergeone(fpA,fpB,fpC,strA,strB,linelength); + if (mergeresult == 0) { + runcount++; + mergeresult = mergeone(fpA,fpB,fpD,strA,strB,linelength); + if (mergeresult == 0) runcount++; + } + if (mergeresult == -1) { + /* both files at EOF */ + break; + } else if (mergeresult == -2) { + /* files in error; message already printed */ + KILLARRAY(array,linecount); + KILLTEMP; + return -1; + } + /* else normal merge; continue */ + } + } while (runcount>1); + + /* + * At this point, fpC contains the sorted file. (We hope ...) + */ + if (fpC==fp1) tempout = file1; + else if (fpC==fp2) tempout = file2; + else if (fpC==fp3) tempout = file3; + else /* fpC==fp4 */ tempout = file4; + + /* + * clean up and exit + */ + + /* copy lines from fpC to infile except for the trailing DELIM */ + rewind(fpC); + for (;;) { + if (fgets(strA,linelength,fpC)==NULL) { + if(v_flag) perror(errmsg5); + return -1; + } + if ((strA[0]==DELIM) && (strA[1]=='\n')) break; + if (fprintf(out_fp,"%s",strA)==EOF) { + if (v_flag) perror("disksort: write on output file failed"); + return -1; + } + } + + free(strA); + free(strB); + KILLARRAY(array,linecount); + KILLTEMP; + + return 0; +} + diff --git a/usr.bin/sort/dsort.1 b/usr.bin/sort/dsort.1 new file mode 100644 index 0000000..8ee2856 --- /dev/null +++ b/usr.bin/sort/dsort.1 @@ -0,0 +1,130 @@ +.TH DSORT 1 "Commands and Applications" "14 June 1994" "Version 1.0" +.SH NAME +dsort, msort \- sort text files lexicographically +.SH SYNOPSIS +.B msort +[ +.I -hvV? +] [ +.I "-o outfile" +] [ +.I "-n lines" +] +.I file1 +[ +.I "file2 ..." +] +.LP +.B dsort +[ +.I -hvV? +] [ +.I "-l length" +] [ +.I "-n lines" +] [ +.I "-o outfile" +] [\fI-t path1\fR[,\fIpath2\fR[,\fIpath3\fR[,\fIpath4\fR]]]] \fIinfile\fR +.SH DESCRIPTION +.BR dsort " and " msort +are robust text file sorting utilities. While they do not support a lot +of features, they are designed to sort large (and small) files very quickly. +.LP +.B msort +is an in-place memory sort. 
Since it uses the heapsort algorithm, it is +O[n lg n] both on average and for worst-case. Provided it has enough memory, +.BR msort +will sort files with lines of arbitrary length. Unless overridden by the +.I -n +flag, +.BR msort +will sort files of up to 1000 lines. Larger files can be sorted provided +there is sufficient core memory. If multiple input files are given, the +output is the concatenated result of sorting the input files separately. +Thus, the following would be equivalent: +.LP +.nf + % msort file1 file2 >outfile +and + % msort file1 >file1out + % msort file2 >file2out + % cat file1out file2out >outfile +.fi +.LP +.B dsort +is a disk sort intended for files too large to be sorted in memory. It +uses a four-file polyphase merge algorithm. Since it is an I/O-bound +program, +.BR dsort "'s +speed is very dependent on the speed of the device used for temporary files. +By default, +.BR dsort +will sort files with lines up to 512 characters long. Lines with more +characters will be truncated unless the +.I -l +flag is used. Also by default, 1000 lines at a time will be sorted in +memory during the collection (first) phase of the merge sort algorithm. +This can be changed using the +.I -n +flag. +.BR dsort +will accept only one input file. +.LP +Both +.BR dsort " and " msort +leave the input file(s) intact. +.SH OPTIONS +.nf +\fI-h\fR \fI-?\fR -- print version and usage info, then exit +\fI-l\fR \fIlength\fR -- use a line length of \fIlength\fR +\fI-n\fR \fIlines\fR -- sort \fIlines\fR lines in memory, (for \fBdsort\fR); don't + try to sort files over \fIlines\fR lines long (for \fBmsort\fR). +\fI-o\fR \fIoutfile\fR -- send sorted output to \fIoutfile\fR rather than to stdout +\fI-t\fR \fIpathlist\fR -- use \fIpathlist\fR as the locations of temp files. If any + of these are not specified, dsort will attempt to use + the directory specified by the environment variable + $(TMPDIR), then the system default temp path. 
+\fI-v\fR -- verbose operation +\fI-V\fR -- print version information +.fi +.SH HINTS +If you have more than one fast drive, the speed of +.B dsort +can in general be improved by using four different drives for the +path list when using +.I -t . +The best speed observed, however, has occurred when $(TMPDIR) or /tmp +resides on a RAM disk or ROM disk. +It is not suggested that floppies be used for temporary files. +.SH RESOURCE USAGE +Both +.BR dsort " and " msort +use 1k of stack space. +.LP +.BR msort +is an in-place sort, so in general the amount of core memory used is +the same as the size of the file to be sorted. When sorting multiple +files, +.BR msort "'s +memory usage will match the size of the largest input file, not the +total of all files. It will use a minimum of approximately 4k of core +memory. +.LP +.BR dsort +by default uses approximately 512k of core memory. This can be modified +by changing the +.I -l +and +.I -n +parameters. Core memory usage is approximately the product of these two +parameters. +.LP +When using +.BR dsort , +the amount of free space on the temporary path(s) must be at least twice +the size of the file to be sorted. +.SH AUTHOR +Devin Reade \- glyn@cs.ualberta.ca +.SH SEE ALSO +.BR sort (1), +.BR uniq (1). diff --git a/usr.bin/sort/dsort.c b/usr.bin/sort/dsort.c new file mode 100644 index 0000000..91d8343 --- /dev/null +++ b/usr.bin/sort/dsort.c @@ -0,0 +1,207 @@ +#ifdef __CCFRONT__ +#include <14:pragma.h> +#endif +/* + * dsort -- sort a text file on disk lexicographically + * + * Synopsis: + * dsort [-hvV?] [-l length] [-n lines] [-o outfile] + * [-t path1[,path2[,path3[,path4]]]] infile + * + * Options: + * -h -? -- print version and usage info, then exit + * -l <length> -- use a line length of <length> + * -n <lines> -- sort <lines> lines in memory. + * -o <outfile> -- sorted output to <outfile> rather than + * to stdout + * -t <pathlist> -- use <pathlist> (up to four paths) as the locations + * of temp files. <pathlist> is of the form: + * path1[,path2[,path3[,path4]]]. 
If any of these
+ *                    are not specified, dsort will attempt to use
+ *                    the system default temp path.
+ *    -v           -- verbose operation
+ *    -V           -- print version information
+ */
+
+
+#define DEFFUNC
+#define DSORT
+#include "common.h"
+
+#include
+#include
+#include
+#include "/usr/include/getopt.h" /* GNU version */
+
+extern int optind;
+extern char *optarg;
+extern int errno;
+
+static char *versionstring="\
+Version 1.0 by Devin Reade\n";
+
+static char *usagestring="\
+dsort -- Sort a text file on disk lexicographically\n\
+\n\
+Synopsis:\n\
+\tdsort\t[-hvV?] [-l length] [-n lines] [-o outfile]\n\
+\t\t[-t path1[,path2[,path3[,path4]]]] infile\n\
+\n\
+Options:\n\
+\t-h -?\t\t-- Print version and usage info, then exit.\n\
+\t-l \t-- Set the maximum line length to .\n\
+\t-n \t\t-- Set the number of lines to sort in memory to .\n\
+\t-o \t-- Dump sorted output to rather\n\
+\t\t\t than to stdout.\n\
+\t-t \t-- Set the paths to use for the location of\n\
+\t\t\t scratch files. Paths are delimited by \',\' characters.\n\
+\t-v\t\t-- Verbose operation.\n\
+\t-V\t\t-- Print version information.\n";
+
+/*
+ * static size_t parse_count (char *arg, size_t fallback);
+ *
+ * Pre:  arg is the argument string of a numeric command line option.
+ *
+ * Post: Returns the leading decimal value of arg.  If arg does not
+ *       start with a number, if the value is out of range for a long,
+ *       or if it is not positive, fallback is returned instead.
+ */
+
+static size_t parse_count (char *arg, size_t fallback) {
+  char *end;
+  long value;
+
+  errno = 0;
+  value = strtol(arg,&end,10);
+  if ((end == arg) || (errno == ERANGE) || (value <= 0)) return fallback;
+  return (size_t) value;
+}
+
+/*
+ * int main (int argc, char **argv);
+ *
+ * Parses the command line, opens the output stream (out_fp), splits the
+ * argument of -t into the globals tpath1 ... tpath4, and hands the
+ * single input file to disksort().
+ *
+ * Returns the result of disksort(), or -1 on a usage error, if the
+ * output file cannot be opened, or if closing the output file fails.
+ *
+ * Uses Globals:
+ *      v_flag, out_fp, tpath1, tpath2, tpath3, tpath4
+ */
+
+int main (int argc, char **argv) {
+
+  char *outfile = NULL;     /* the name of the output file, if nec */
+  size_t maxlinelen = DEFAULT_LINELENGTH;   /* max recognised line length */
+  size_t maxlinecount = DEFAULT_LINECOUNT;  /* lines to sort in memory */
+  char *tbuffer = NULL;     /* private copy of the -t argument */
+  char **tslot[4];          /* the globals that receive the temp paths */
+  char *tp, *start;
+  int c, k, more;
+  short errflag=0;
+  short o_flag=0;
+  short V_flag=0;
+  /* v_flag defined in common.h */
+
+#ifdef DEBUG
+  begin_stack_check();
+#endif
+
+  /*
+   * parse the command line
+   */
+
+  while ((c= getopt(argc,argv,"hl:n:o:t:vV?")) != EOF)
+    switch (c) {
+    case 'l':   /* use this as the maximum line length */
+      maxlinelen = parse_count(optarg,DEFAULT_LINELENGTH);
+      break;
+    case 'n':   /* sort this number of lines in memory */
+      maxlinecount = parse_count(optarg,DEFAULT_LINECOUNT);
+      break;
+    case 'o':   /* redirect sorted output to file */
+      o_flag++;
+      outfile = optarg;
+      break;
+    case 't':   /* define locations of temp files */
+      /* if -t is repeated, only the last list is kept */
+      if (tbuffer != NULL) free(tbuffer);
+      if ((tbuffer=malloc(strlen(optarg)+1))==NULL) {
+        perror("couldn't allocate temporary buffer; using default");
+        break;
+      }
+      strcpy(tbuffer,optarg);
+      break;
+    case 'v':   /* verbose */
+      v_flag++;
+      break;
+    case 'V':   /* print version information */
+      V_flag++;
+      break;
+    case '?':   /* fallthrough */
+    case 'h':   /* fallthrough */
+    default:    /* Display usage, version, and exit */
+      V_flag++;
+      errflag++;
+      break;
+    }
+
+  /*
+   * React to command line parameters
+   */
+
+  if (errflag) {
+    fprintf (stderr,"\n%s\n%s\n",usagestring,versionstring);
+    return -1;
+  }
+  if (V_flag) fprintf(stderr,"\n%s\n",versionstring);
+
+  /* size_t need not be unsigned long, so cast to match the %lu format */
+  if (v_flag) fprintf(stderr,
+    "Sorting %lu lines in memory.\nMaximum recognised line length = %lu\n",
+    (unsigned long) maxlinecount,(unsigned long) maxlinelen);
+  if (o_flag) {
+    if ((out_fp = fopen(outfile,"w")) == NULL) {
+      if (v_flag) perror("open on output file failed");
+      return -1;
+    }
+  } else out_fp = stdout;
+
+  /*
+   * Split the comma-delimited path list in place.  A missing or empty
+   * component leaves the matching tpath NULL; anything after the
+   * fourth component is ignored.
+   */
+
+  tpath1 = NULL;
+  tpath2 = NULL;
+  tpath3 = NULL;
+  tpath4 = NULL;
+  if (tbuffer != NULL) {
+    tslot[0] = &tpath1;
+    tslot[1] = &tpath2;
+    tslot[2] = &tpath3;
+    tslot[3] = &tpath4;
+    tp = tbuffer;
+    for (k=0; k<4; k++) {
+      start = tp;
+      while (*tp && (*tp!=',')) tp++;
+      more = (*tp == ',');
+      *tp = '\0';                /* terminate this component */
+      if (*start) {
+        *tslot[k] = start;
+        if (v_flag)
+          fprintf(stderr,"Will try to use temp directory %s\n",start);
+      }
+      if (!more) break;
+      tp++;
+    }
+  } else if (v_flag) fprintf(stderr,"Using default temp path\n");
+
+  /* do the sort */
+  if (argc - optind == 1) {
+    c = disksort(argv[optind],maxlinecount,maxlinelen);
+  } else {
+    fprintf(stderr,"\n%s\n%s\n",usagestring,versionstring);
+    c = -1;
+  }
+
+  /* buffered write errors only show up when the stream is closed */
+  if (o_flag && (fclose(out_fp) == EOF)) {
+    if (v_flag) perror("close on output file failed");
+    c = -1;
+  }
+
+  /* the tpath pointers refer into tbuffer; release it only now */
+  if (tbuffer != NULL) free(tbuffer);
+
+#ifdef DEBUG
+  fprintf(stderr,"%s stack usage: %d bytes\n",argv[0],end_stack_check());
+#endif
+
+  return c;
+}
diff --git a/usr.bin/sort/initdisksort.c b/usr.bin/sort/initdisksort.c
new file mode 100644
index 0000000..3fa4a98
--- /dev/null
+++ b/usr.bin/sort/initdisksort.c
@@ -0,0 +1,105 @@
+#ifdef __CCFRONT__
+#include <14:pragma.h>
+#endif
+
+#define DSORT
+#include "common.h"
+#include
+#include
+
+#define OPENMODE "w+" /* the mode for create/read/write */
+
+static char *errstring1="initdisksort: couldn't get temp name";
+static char *errstring2="initdisksort: couldn't open temp file";
+
+/*
+ * int initdisksort(void);
+ *
+ * Pre: None.
+ *
+ * Post: Returns 0 on success, -1 on failure.
+ * On success:
+ * file1 through file4 are initialized as temp file names.
+ * fp1 through fp4 are open file pointers for file1 ... file4.
+ * On failure:
+ * any temp files already opened have been closed and removed, and
+ * the names in file1 through file4 have been freed.
+ *
+ * Uses Globals:
+ * fp1, fp2, fp3, fp4
+ * file1, file2, file3, file4,
+ * tpath1, tpath2, tpath3, tpath4 (directory passed to tempnam)
+ * v_flag
+ */
+
+int initdisksort(void) {
+
+  char **name[4];     /* the four global temp file names */
+  FILE **stream[4];   /* the four global temp file pointers */
+  char *dir[4];       /* the directory requested for each temp file */
+  int i, j;
+
+  name[0] = &file1;   stream[0] = &fp1;   dir[0] = tpath1;
+  name[1] = &file2;   stream[1] = &fp2;   dir[1] = tpath2;
+  name[2] = &file3;   stream[2] = &fp3;   dir[2] = tpath3;
+  name[3] = &file4;   stream[3] = &fp4;   dir[3] = tpath4;
+
+  /*
+   * Get the names for the temp files.  All four names are obtained
+   * before any file is created, as before.
+   */
+
+  for (i=0; i<4; i++) {
+    if ((*name[i] = tempnam(dir[i],"dsort")) == NULL) {
+      if (v_flag) perror(errstring1);
+      for (j=0; j<i; j++) free(*name[j]);
+      return -1;
+    }
+  }
+
+  /*
+   * Open the temp files.  If one fails, close and remove the ones
+   * that were already opened, then release all four names.
+   */
+
+  for (i=0; i<4; i++) {
+    if ((*stream[i] = fopen(*name[i],OPENMODE))==NULL) {
+      if (v_flag) perror(errstring2);
+      for (j=0; j<i; j++) {
+        fclose(*stream[j]);     /* close before unlinking */
+        unlink(*name[j]);
+      }
+      for (j=0; j<4; j++) free(*name[j]);
+      return -1;
+    }
+  }
+
+  return 0;
+}
diff --git a/usr.bin/sort/linecount.c b/usr.bin/sort/linecount.c
new file mode 100644
index 0000000..7ece4a4
--- /dev/null
+++ b/usr.bin/sort/linecount.c
@@ -0,0 +1,89 @@
+#ifdef __CCFRONT__
+#include <14:pragma.h>
+#endif
+
+#include "common.h"
+
+#include
+#include
+#include
+
+/*
+ * unsigned long int linecount (char *filename, size_t *maxlinelen);
+ *
+ * Pre: is the name of the file for which we need to know
+ * the number of lines. The file must be closed.
+ * + * Post: Returns the number of newline characters in the file. On + * return, the file is again closed and *maxlinelen is the length + * of the longest line in (length is calculated to + * include the newline character but not the null terminator. + * Returns zero on failure or if there are no newlines. + * + * Uses Globals: + * v_flag + */ + +unsigned long int linecount (char *filename, size_t *maxlinelen) { + + char *buff; /* the input buffer */ + unsigned long result; /* the number of newlines */ + int count; /* the number of chars last read */ + int fd; /* file descriptor for */ + short done; + int i; + size_t linelen; /* length of current line */ + + /* init some variables */ + done = 0; + result = 0; + *maxlinelen = 0; + linelen = 0; + + /* open for unbuffered I/O */ + if ((fd = open(filename,O_RDONLY)) == -1) { + if (v_flag) perror("linecount: couldn't open input file"); + return 0lu; + } + + /* get an input buffer */ + if ((buff = malloc(BUFFERSIZE)) == NULL) { + if (v_flag) perror ("linecount: couldn't allocate buffer"); + close(fd); + return 0lu; + } + + /* repeatedly fill the buffer and increment the newline count */ + while (!done) { + count = read (fd, buff, BUFFERSIZE); + switch (count) { + case -1: /* file error */ + if (v_flag) perror ("linecount"); + close(fd); + free(buff); + return 0lu; + /* NOTREACHED */ + break; + + case 0: /* EOF */ + done = 1; + break; + + default: /* got some info in the buffer */ + for (i=0; i *maxlinelen) *maxlinelen = linelen; + linelen = 0; + } + } + break; + } + } + + /* clean up and return */ + close(fd); + free(buff); + return result; +} diff --git a/usr.bin/sort/loadarray.c b/usr.bin/sort/loadarray.c new file mode 100644 index 0000000..9d18191 --- /dev/null +++ b/usr.bin/sort/loadarray.c @@ -0,0 +1,121 @@ +#ifdef __CCFRONT__ +#include <14:pragma.h> +#endif + +#include "common.h" + +#include +#include +#include + +/* + * char **loadarray (unsigned long n, char *filename, size_t maxlinelen); + * + * Pre: is the 
name of a file containing lines of text, + * the number of lines. The file must be closed. + * is the length of the longest line in + * + * Post: Returns a pointer to an array of pointers to malloc'd strings, + * where the strings are successive lines from the file . + * On return will be closed. If loadarray() fails for + * any reason, it will return NULL. + * + * Warning: The use of realloc() with a NULL pointer for the initial + * allocation may not be portable. If this is not valid for your + * current libraries, then #define BROKEN_REALLOC. + * + * Uses Globals: + * v_flag + */ + +char **loadarray (unsigned long n, char *filename, size_t maxlinelen) { + + char **result; + unsigned long i,j; + FILE *in_fp; + static char *inbuf=NULL; + static size_t previous_size = 0; + char *p; + + +#ifndef BROKEN_REALLOC /* realloc() is ANSI-compliant with NULL first arg */ + + /* reallocate the input buffer if necessary */ + if (maxlinelen > previous_size) { + if ((p = realloc(inbuf,maxlinelen+1)) == NULL) { + if (v_flag) perror("loadarray: couldn't (re)allocate input buffer"); + return NULL; + } + previous_size = maxlinelen; + inbuf = p; + } + +#else /* BROKEN_REALLOC */ + + /* reallocate the input buffer if necessary */ + if (maxlinelen > previous_size) { + if (previous_size == 0) { + if ((p = malloc(maxlinelen+1)) == NULL) { + if (v_flag) perror("loadarray: couldn't allocate input buffer"); + return NULL; + } + } else { + if ((p = realloc(inbuf,maxlinelen+1)) == NULL) { + if (v_flag) perror("loadarray: couldn't reallocate input buffer"); + return NULL; + } + } + previous_size = maxlinelen; + inbuf = p; + } + + +#endif /* BROKEN_REALLOC */ + + /* allocate the array */ + if ((result = malloc (n * sizeof(char *)))==NULL) { + if (v_flag) perror("loadarray: couldn't allocate base array"); + return NULL; + } + + /* set up the input stream */ + in_fp = fopen(filename,"r"); + if (in_fp == NULL) { /* open failed */ + free(result); + if (v_flag) perror("loadarray: couldn't open 
input file");
+    return NULL;
+  }
+
+  /* allocate and copy elements */
+  for (i=0; i
+#endif
+
+#define DSORT
+#include "common.h"
+
+#include
+
+/*
+ * is_run_end -- returns nonzero if str is a run delimiter line, that is
+ * the DELIM character followed by a newline.
+ */
+
+static int is_run_end (char str[]) {
+  return ((str[0]==DELIM) && (str[1]=='\n'));
+}
+
+/*
+ * read_line -- fetch the next line of fp into str (size linelength).
+ * Returns 0 if a line was read or fp reached EOF (the caller tests
+ * feof()), and -2 on a read error, after printing errmsg if v_flag
+ * is set.
+ */
+
+static int read_line (FILE *fp, char str[], size_t linelength,
+                      char *errmsg) {
+  if ((fgets(str,(int) linelength,fp)==NULL) && ferror(fp)) {
+    if (v_flag) perror(errmsg);
+    return -2;
+  }
+  return 0;
+}
+
+/*
+ * write_line -- append str to fpC.  Returns 0 on success and -2 on a
+ * write error.  fprintf reports failure with any negative value, so
+ * the result is tested with "< 0" rather than "== EOF".
+ */
+
+static int write_line (FILE *fpC, char str[]) {
+  if (fprintf(fpC,"%s",str) < 0) {
+    if (v_flag) perror("mergeone: Write error on fpC");
+    return -2;
+  }
+  return 0;
+}
+
+/*
+ * copy_run -- str holds the current line of fp.  Copy lines from fp to
+ * fpC until the run delimiter or EOF is reached.  Returns 0 on success
+ * and -2 on error.
+ */
+
+static int copy_run (FILE *fp, FILE *fpC, char str[], size_t linelength,
+                     char *errmsg) {
+  while (!feof(fp) && !is_run_end(str)) {
+    if (write_line(fpC,str) != 0) return -2;
+    if (read_line(fp,str,linelength,errmsg) != 0) return -2;
+  }
+  return 0;
+}
+
+/*
+ * int mergeone(FILE *fpA, FILE *fpB, FILE *fpC, char strA[], char strB[],
+ * size_t linelength);
+ *
+ * Pre: fpA, fpB, and fpC are open file pointers. The first two should
+ * contain "runs" of data delimited by a line consisting of just the
+ * DELIM character, although either or both may be at EOF. strA and
+ * strB are scratch character buffers, each of size linelength.
+ *
+ * Post: The first run on each of fpA and fpB are merge-sorted and added to
+ * fpC. If either fpA or fpB are at EOF then the run from the other
+ * file pointer is simply concatenated onto fpC. Mergeone will return
+ * zero if the merge was successful, -1 if both fpA and fpB are at
+ * EOF, and -2 if there was an error. On return, the contents of
+ * strA and strB are undefined.
+ *
+ * Uses Globals:
+ * v_flag -- if set and an error occurs, a message will be printed
+ * to stderr
+ */
+
+
+int mergeone(FILE *fpA, FILE *fpB, FILE *fpC, char strA[], char strB[],
+             size_t linelength) {
+
+  static char errA[] = "mergeone: Read error on fpA";
+  static char errB[] = "mergeone: Read error on fpB";
+  short run_end_A = 0;   /* set once the delimiter of run A is seen */
+  short run_end_B = 0;   /* set once the delimiter of run B is seen */
+
+  /*
+   * Load strA and strB with the first lines from fpA and fpB. After
+   * this, either file may be at EOF (but not error).
+   */
+
+  if (read_line(fpA,strA,linelength,errA) != 0) return -2;
+  if (read_line(fpB,strB,linelength,errB) != 0) return -2;
+
+  /*
+   * merge fpA and fpB until we either get an EOF or a DELIM line
+   */
+
+  while (!feof(fpA) && !feof(fpB)) {
+
+    /* test to see if our run is finished */
+    if (is_run_end(strA)) {
+      run_end_A = 1;
+      break;
+    }
+    if (is_run_end(strB)) {
+      run_end_B = 1;
+      break;
+    }
+
+    /* emit the smaller line and refill its buffer; ties come from fpB */
+    if (strcmp(strA,strB) < 0) {
+      if (write_line(fpC,strA) != 0) return -2;
+      if (read_line(fpA,strA,linelength,errA) != 0) return -2;
+    } else {
+      if (write_line(fpC,strB) != 0) return -2;
+      if (read_line(fpB,strB,linelength,errB) != 0) return -2;
+    }
+  }
+
+  /*
+   * We've come to the end of at least one of the runs, concatenate
+   * the remainder on the output file
+   */
+
+  if (!run_end_A && (copy_run(fpA,fpC,strA,linelength,errA) != 0))
+    return -2;
+  if (!run_end_B && (copy_run(fpB,fpC,strB,linelength,errB) != 0))
+    return -2;
+
+  /*
+   * At this point, both fpA and fpB are either at a run-end or at EOF,
+   * with no errors. If at EOF, then don't append a DELIM character.
+   */
+
+  if (feof(fpA) && feof(fpB)) return -1;
+  if (fprintf(fpC,"%c\n",DELIM) < 0) {
+    if (v_flag) perror("mergeone: Write error on fpC");
+    return -2;
+  }
+  return 0;
+}
+
+
+
+
+
diff --git a/usr.bin/sort/msort.1 b/usr.bin/sort/msort.1
new file mode 100644
index 0000000..9618d80
--- /dev/null
+++ b/usr.bin/sort/msort.1
@@ -0,0 +1 @@
+.so /usr/man/man1/dsort.1
diff --git a/usr.bin/sort/msort.c b/usr.bin/sort/msort.c
new file mode 100644
index 0000000..f85399c
--- /dev/null
+++ b/usr.bin/sort/msort.c
@@ -0,0 +1,155 @@
+#ifdef __CCFRONT__
+#include <14:pragma.h>
+#endif
+/*
+ * msort -- sort a text file in memory lexicographically
+ *
+ * Synopsis:
+ * msort [-hvV?] [-o outfile] [-n lines] file1 [file2 ...]
+ *
+ * Options:
+ * -h -? -- print version and usage info, then exit
+ * -n -- don't try to sort files over lines long
+ * -o -- sorted output to rather than
+ * to stdout
+ * -v -- verbose operation
+ * -V -- print version information
+ */
+
+#define DEFFUNC
+#define MSORT
+#include "common.h"
+
+#include
+#include
+#include
+#include "/usr/include/getopt.h" /* GNU version */
+
+extern int optind;
+extern char *optarg;
+extern int errno;
+
+static char *versionstring="\
+Version 1.0 by Devin Reade\n";
+
+static char *usagestring="\
+msort -- Sort a text file in memory lexicographically\n\
+\n\
+Synopsis:\n\
+\tmsort [-hvV?] 
[-o outfile] file1 [file2 ...]\n\ +\n\ +Options:\n\ +\t-h -?\t\t-- Print version and usage info, then exit.\n\ +\t-n \t\t-- Set the maximum number of lines per file to .\n\ +\t-o \t-- Dump sorted output to rather\n\ +\t\t\t than to stdout.\n\ +\t-v\t\t-- Verbose operation.\n\ +\t-V\t\t-- Print version information.\n"; + +int main (int argc, char **argv) { + + size_t lc, i; + char *outfile; /* the name of the output file, if nec */ + char **array; /* an array of strings; for sorting */ + size_t maxlinelen; /* length of longest line in current file */ + size_t maxlinecount; /* max number of lines we want to allow */ + short failed=0; /* any errors found? */ + int c; + short errflag=0; + short n_flag=0; + short o_flag=0; + short V_flag=0; + /* v_flag defined in common.h */ + +#ifdef DEBUG + begin_stack_check(); +#endif + + /* + * parse the command line + */ + + while ((c= getopt(argc,argv,"hn:o:vV?")) != EOF) + switch (c) { + case 'n': /* don't try to sort if file is over n lines long */ + n_flag++; + errno = 0; + maxlinecount = (size_t) atol(optarg); + if (errno == ERANGE) maxlinecount = DEFAULT_LINECOUNT; + break; + case 'o': /* redirect sorted output to file */ + o_flag++; + outfile = optarg; + break; + case 'v': /* verbose */ + v_flag++; + break; + case 'V': /* print version information */ + V_flag++; + break; + case '?': /* fallthrough */ + case 'h': /* fallthrough */ + default: /* Display usage, version, and exit */ + V_flag++; + errflag++; + break; + } + + /* + * React to command line parameters + */ + + if (errflag) { + fprintf(stderr,"\n%s\n%s\n",usagestring,versionstring); + return -1; + } + if (V_flag) fprintf(stderr,"\n%s\n",versionstring); + if (!n_flag) maxlinecount = DEFAULT_LINECOUNT; + if (v_flag) fprintf(stderr,"Maximum lines per file = %lu\n",maxlinecount); + + if (o_flag) { + if ((out_fp = fopen(outfile,"w")) == NULL) { + if (v_flag) perror("open on output file failed"); + return -1; + } + } else out_fp = stdout; + + /* loop through files */ + for 
(; optindmaxlinecount) { + if (v_flag) + fprintf(stderr,"%s too long for an in-memory sort -- file skipped\n", + argv[optind]); + failed = 1; + continue; + } + + /* load the array */ + array = loadarray (lc, argv[optind], maxlinelen); + if (array == NULL) { + if (v_flag) fprintf(stderr,"Ignoring file %s\n",argv[optind]); + failed = 1; + continue; + } + + /* sort it */ + sortarray (array,lc); + + /* print the sorted file out and clean up the array */ + for (i=0; i +#endif + +#include "common.h" + +#include + +/* + * void sortarray(char **array, unsigned long n); + * + * Pre: is a pointer to an array of pointers to NULL-terminated + * strings, and is the number of elements in + * + * Post: The strings in are sorted lexicographically in ascending + * order, using the heapsort algorithm. This is an in-place + * non-recursive sort with behavior O[n*lg(n)] both on average + * and worst-case. + */ + +void sortarray(char *array[], unsigned long n) { + + long l, j, ir, i; + char *rra; + + if (n==1) return; /* no need to sort one element */ + --array; /* fudge since the algorithm was designed */ + /* for a unit-indexing */ + + l = (n>>1) + 1; + ir = n; + + /* + * The index l will be decremented from its initial value down to 0 during + * the heap creation phase. Once it reaches 0, the index ir will be + * decremented from its initial value down to 0 during the heap selection + * phase. + */ + for (;;) { + if (l > 1) /* still in creation phase */ + rra = array[--l]; + else { /* in selection phase */ + rra= array[ir]; /* clear a space at the end of array */ + array[ir] = array[1]; /* retire the top of the heap into it */ + if (--ir == 1) { /* done with the last promotion */ + array[1] = rra; + return; + } + } + i = l; /* set up to sift down element rra to its proper place */ + j = l << 1; + while (j<=ir) { + if (j +#endif +/* + * #include + * + * char *tempnam (const char *dir, const char *prefix); + * + * Generate a pathname for a temporary file. 
/*
 * tempnam builds the name in a malloc'ed buffer that the caller owns
 * and must free.  The directory is the first of the following that
 * passes the access() check:
 *
 *   1. dir, if it is neither NULL nor empty;
 *   2. the value of the TMPDIR environment variable, if it is set and
 *      non-empty;
 *   3. P_tmpdir (from stdio.h, or the fallback defined below);
 *   4. "/tmp" as a last resort.
 *
 * The file name is made of at most PREFIX_MAX characters of prefix
 * (if non-NULL), a three letter seed that changes on every call, and
 * six characters filled in by mktemp().
 *
 * NULL is returned if no directory is usable, if memory cannot be
 * allocated, or if mktemp() cannot produce a unique name.
 */

#ifdef __ORCAC__
#define __GNO__ 1
#endif

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define max(A,B) (((A)<(B))?(B):(A))

#if !defined(__GNO__)
extern char *mktemp();
extern int access();
#endif

/* three letter counter embedded in every generated name */
static char seed[4]="AAA";

/*
 * pbrk is the path delimiter.  Under GNO it is detected per directory
 * by cpdir(); elsewhere it is always '/'.  The macro must not carry a
 * trailing semicolon because it is used inside expressions.
 */
#if (defined __GNO__)
static char pbrk;
#else
# define pbrk '/'
#endif

/* BSD stdio.h doesn't define P_tmpdir, so let's do it here */
#ifndef P_tmpdir
static char *P_tmpdir = "/tmp";
#endif

/*
 * Mode handed to access().  3 is W_OK|X_OK with the conventional
 * values W_OK == 2 and X_OK == 1.
 */
#if defined(__GNO__)
# define TMP_ACCESS W_OK
#else
# define TMP_ACCESS 3
#endif

#define LAST_RESORT "/tmp"     /* directory tried when all else fails */
#define PREFIX_MAX  5          /* characters of prefix that are used  */
#define SUFFIX      "XXXXXX"   /* template part replaced by mktemp()  */

/* room for: delimiter + prefix + seed + SUFFIX + terminating NUL */
#define NAME_ROOM \
   (1 + PREFIX_MAX + (sizeof seed - 1) + (sizeof SUFFIX - 1) + 1)


/*
 * Copy directory name str into buf, dropping one trailing path
 * delimiter (a name that consists of the delimiter alone is kept).
 * Under GNO the delimiter is chosen first: ':' if str contains one,
 * else '/' if str contains one, else whichever of the two the PATH
 * environment variable uses ('/' if PATH is unset or has no ':').
 *
 * buf must be large enough for str.  If str is NULL, buf is left
 * untouched.  Returns buf.
 */
static char *
cpdir(char *buf, char *str)
{
   size_t len;
#if defined(__GNO__)
   char *path;
#endif

   if (str == NULL) return(buf);

#if defined(__GNO__)
   /* get the path delimiter */
   if (strchr(str,':')) pbrk = ':';
   else if (strchr(str,'/')) pbrk = '/';
   else {
      if ((path=getenv("PATH"))==NULL) pbrk = '/';
      else pbrk = (strchr(path,':')) ? ':' : '/';
   }
#endif

   (void) strcpy(buf, str);
   len = strlen(buf);

   /* len > 1 guards the empty string and keeps a bare root directory */
   if ((len > 1) && (buf[len-1] == pbrk)) buf[len-1] = '\0';

   return(buf);
}


/*
 * Step the seed to its next value: AAA, BAA, ... ZAA, ABA, ... ZZZ,
 * then back to AAA.  The terminating NUL is never modified, so the
 * seed always remains a valid three character string.
 */
static void
advance_seed(void)
{
   char *q = seed;

   while (*q == 'Z') *q++ = 'A';
   if (*q != '\0') ++*q;
}


char *
tempnam (char *dir, char *prefix)
   /* dir    -- use this directory please (if non-NULL) */
   /* prefix -- use this (if non-NULL) as filename prefix */
{
   char *p, *tmpdir;
   size_t dl = 0, tl = 0, pl, fl, len;

   /* create a buffer that's as large as necessary */
   pl = strlen(P_tmpdir);
   fl = strlen(LAST_RESORT);
   if ((tmpdir = getenv("TMPDIR")) != NULL) tl = strlen(tmpdir);
   if (dir != NULL) dl = strlen(dir);
   p = malloc(max(max(dl,tl),max(pl,fl)) + NAME_ROOM);
   if (p == NULL) return(NULL);
   *p = '\0';

   /* take the first candidate that passes access(); see header comment */
   if (   ((dl == 0) || (access(cpdir(p, dir), TMP_ACCESS) != 0))
       && ((tl == 0) || (access(cpdir(p, tmpdir), TMP_ACCESS) != 0))
       && (access(cpdir(p, P_tmpdir), TMP_ACCESS) != 0)
       && (access(cpdir(p, LAST_RESORT), TMP_ACCESS) != 0)) {
      free(p);
      return(NULL);
   }

   /*
    * Append the delimiter that cpdir() settled on rather than a fixed
    * '/', so that a ':'-delimited directory is not extended with a
    * mismatched separator.
    */
   len = strlen(p);
   if ((len == 0) || (p[len-1] != pbrk)) {
      p[len++] = pbrk;
      p[len] = '\0';
   }

   /* strncat always terminates the result */
   if (prefix != NULL) (void) strncat(p, prefix, PREFIX_MAX);

   (void) strcat(p, seed);
   (void) strcat(p, SUFFIX);
   advance_seed();

   /* mktemp reports failure by returning an empty string */
   tmpdir = mktemp(p);
   if ((tmpdir == NULL) || (*tmpdir == '\0')) {
      free(p);
      return(NULL);
   }
   return(p);
}