Path: utzoo!attcan!uunet!husc6!bbn!uwmcsd1!ig!agate!ucbvax!CORNELLC.CCS.CORNELL.EDU.BITNET!ewilts%Ins.MRC.AdhocNet.CA%Stasis.MRC.AdhocNet.CA%UNCAEDU.
From: ewilts%Ins.MRC.AdhocNet.CA%Stasis.MRC.AdhocNet.CA%UNCAEDU.@CORNELLC.CCS.CORNELL.EDU.BITNET (Ed Wilts)
Newsgroups: comp.os.vms
Subject: ARC_C.SHAR07_OF_19
Message-ID: <880624092451.027@Ins.MRC.AdhocNet.CA>
Date: 24 Jun 88 15:24:47 GMT
Sender: daemon@ucbvax.BERKELEY.EDU
Organization: The Internet
Lines: 430
$Part08:
$ File_is="ARCLZW.C"
$ Check_Sum_is=606678970
$ Copy SYS$Input VMS_SHAR_DUMMY.DUMMY
Xstatic char *RCSid = "$Header: arclzw.c,v 1.2 86/07/15 07:53:20 turner Exp $";
X
X/*
X * $Log:`009arclzw.c,v $
X * Hack-attack 1.3 86/12/20 01:23:45 wilhite@usceast.uucp
X * `009Bludgeoned into submission for VAX 11/780 BSD4.2
X *`009(ugly code, but fewer core dumps)
X *
X * Revision 1.2 86/07/15 07:53:20 turner
X *
X *
X * Revision 1.1 86/06/26 15:00:26 turner
X * initial version
X *
X *
X */
X
X/* ARC - Archive utility - ARCLZW
X
X$define(tag,$$segment(@1,$$index(@1,=)+1))#
X$define(version,Version $tag(
XTED_VERSION DB =1.88), created on $tag(
XTED_DATE DB =01/20/86) at $tag(
XTED_TIME DB =16:47:04))#
X$undefine(tag)#
X $version
X
X(C) COPYRIGHT 1985 by System Enhancement Associates; ALL RIGHTS RESERVED
X
X By: Thom Henderson
X
X Description:
X This file contains the routines used to implement Lempel-Ziv
X data compression, which calls for building a coding table on
X the fly. This form of compression is especially good for encoding
X files which contain repeated strings, and can often give dramatic
X improvements over traditional Huffman SQueezing.
X
X Language:
X Computer Innovations Optimizing C86
X
X Programming notes:
X In this section I am drawing heavily on the COMPRESS program
X from UNIX. The basic method is taken from "A Technique for High
X Performance Data Compression", Terry A. Welch, IEEE Computer
X Vol 17, No 6 (June 1984), pp 8-19. Also see "The Art of Computer
X Programming", Donald Knuth, Vol 3 (Sorting and Searching), Section 6.4.
X
X As best as I can tell, this method works by tracing down a hash
X table of code strings where each entry has the property:
X
X if <string> <char> is in the table
X then <string> is in the table.
X*/
X#include <stdio.h>
X#include "arc.h"
X
X/* definitions for older style crunching */
X
X#define FALSE 0
X/* NOTE: TRUE expands to the unparenthesized expression !FALSE. */
X#define TRUE !FALSE
X/* Number of entries in string_tab. */
X#define TABSIZE 4096
X/*
X * The three markers below all share the value 0xFFFF. Their users are
X * not in this part of the file; judging by the names they presumably mark
X * "no predecessor", "unused link" and "lookup failed" -- TODO confirm.
X */
X#define NO_PRED 0xFFFF
X#define EMPTY 0xFFFF
X#define NOT_FND 0xFFFF
X
Xstatic unsigned INT inbuf; /* partial input code storage */
Xstatic INT sp; /* current stack pointer */
X
X/*
X * String table used by the older crunching scheme. In the MSDOS build
X * the same storage is also reused, through pointers, as htab and suffix
X * for the newer scheme.
X */
Xstatic struct entry /* string table entry format */
X{ char used; /* true when this entry is in use */
X unsigned INT next; /* ptr to next in collision list */
X unsigned INT predecessor; /* code for preceding string */
X unsigned char follower; /* char following string */
X} string_tab[TABSIZE]; /* the code string table */
X
X
X/* definitions for the new dynamic Lempel-Zev crunching */
X
X#define BITS 12 /* maximum bits per code */
X/* Hash table size; 1<<BITS codes in 5003 slots is roughly 80% full. */
X#define HSIZE 5003 /* 80% occupancy */
X#define INIT_BITS 9 /* initial number of bits/code */
X
Xstatic INT n_bits; /* number of bits/code */
Xstatic INT maxcode; /* maximum code, given n_bits */
X#define MAXCODE(n) ((1<<(n)) - 1) /* maximum code calculation */
Xstatic INT maxcodemax = 1 << BITS; /* largest possible code (+1) */
X
X/*
X * Staging buffer for packed codes. Eight codes of n_bits bits occupy
X * exactly n_bits bytes, so BITS bytes cover the widest code size.
X */
Xstatic unsigned char buf[BITS]; /* input/output buffer */
X
X/* lmask[i] has the low i bits cleared; used to keep the high part of a byte. */
Xstatic unsigned char lmask[9] = /* left side masks */
X{ 0xff, 0xfe, 0xfc, 0xf8, 0xf0, 0xe0, 0xc0, 0x80, 0x00 };
X/* rmask[i] has only the low i bits set; used to keep the low part of a byte. */
Xstatic unsigned char rmask[9] = /* right side masks */
X{ 0x00, 0x01, 0x03, 0x07, 0x0f, 0x1f, 0x3f, 0x7f, 0xff};
X
X/*
X * putcode() treats this as a BIT offset into buf: it shifts it right by 3
X * to find the byte and flushes when it reaches n_bits<<3.
X */
Xstatic INT offset; /* bit offset into buf for code output */
Xstatic long in_count; /* length of input */
Xstatic long bytes_out; /* length of compressed output */
X/*
X * Set to the first input byte by putc_cm() and then combined with each
X * following byte to form the hash key.
X */
Xstatic unsigned INT ent;
X
X/* To save much memory (which we badly need at this point), we overlay
X * the table used by the previous version of Lempel-Zev with those used
X * by the new version. Since no two of these routines will be used
X * together, we can safely do this. Note that the tables used for Huffman
X * squeezing may NOT overlay these, since squeezing and crunching are done
X * in parallel.
X */
X
X/*
X * MSDOS build: htab is a pointer aliasing the old string_tab storage.
X * BSD/ST builds: htab is a separate array of HSIZE longs.
X * If none of MSDOS, BSD or ST is non-zero, htab is not declared at all;
X * presumably arc.h selects exactly one configuration -- TODO confirm.
X */
X#if MSDOS
Xstatic long *htab = string_tab; /* hash code table (crunch) */
X#endif
X#if BSD | ST
Xstatic long htab[HSIZE]; /* hash code table (crunch) */
X#endif
Xstatic unsigned INT codetab[HSIZE]; /* string code table (crunch) */
X
X/* prefix shares codetab's storage: one array serves both directions. */
Xstatic unsigned INT *prefix = codetab; /* prefix code table (uncrunch) */
X
X/* Same build-dependent layout as htab: alias on MSDOS, own array otherwise. */
X#if MSDOS
Xstatic unsigned char *suffix = string_tab; /* suffix table (uncrunch) */
X#endif
X#if BSD | ST
Xstatic unsigned char suffix[HSIZE]; /* suffix table (uncrunch) */
X#endif
Xstatic INT free_ent; /* first unused entry */
X/* Set to 1 by init_cm(); cleared by putc_cm() after the first byte. */
Xstatic INT firstcmp; /* true at start of compression */
Xstatic unsigned char stack[HSIZE]; /* local push/pop stack */
X
X/*
X * block compression parameters -- after all codes are used up,
X * and compression rate changes, start over.
X */
X
X/*
X * Set to 1 by cl_block() when the table is cleared; putcode() and
X * getcode() see it, drop back to INIT_BITS-wide codes and reset it to 0.
X */
Xstatic INT clear_flg;
X/* Best ratio seen so far, as computed by cl_block() with 8 fractional bits. */
Xstatic long ratio;
X#define CHECK_GAP 10000 /* ratio check interval */
X/* cl_block() sets this to in_count + CHECK_GAP each time it runs. */
Xstatic long checkpoint;
X
X/*
X * the next two codes should not be changed lightly, as they must not
X * lie within the contiguous general code space.
X */
X#define FIRST 257 /* first free entry */
X#define CLEAR 256 /* table clear output code */
X
Xstatic INT cl_block(t) /* table clear for block compress */
XFILE *t; /* our output file */
X{
X long rat;
X INT putcode();
X
X checkpoint = in_count + CHECK_GAP;
X
X if(in_count > 0x007fffff) /* shift will overflow */
X { rat = bytes_out >> 8;
X if(rat == 0) /* Don't divide by zero */
X rat = 0x7fffffff;
X else rat = in_count / rat;
X }
X else rat = (in_count<<8)/bytes_out;/* 8 fractional bits */
X
X if(rat > ratio)
X ratio = rat;
X else
X { ratio = 0;
X setmem`009(htab,HSIZE*sizeof(long),0xff);
X free_ent = FIRST;
X clear_flg = 1;
X putcode(CLEAR,t);
X }
X}
X
X/*****************************************************************
X *
X * Output a given code.
X * Inputs:
X * code: A n_bits-bit integer. If == -1, then EOF. This assumes
X * that n_bits =< (long)wordsize - 1.
X * Outputs:
X * Outputs code to the file.
X * Assumptions:
X * Chars are 8 bits long.
X * Algorithm:
X * Maintain a BITS character long buffer (so that 8 codes will
X * fit in it exactly). When the buffer fills up empty it and start over.
X */
X
Xstatic INT putcode(code,t) /* output a code */
XINT code; /* code to output */
XFILE *t; /* where to put it */
X{
X INT r_off = offset; /* right offset */
X INT bits = n_bits; /* bits to go */
X unsigned char *bp = buf; /* buffer pointer */
X INT n; /* index */
X
X if(code >= 0) /* if a real code */
X { /*
X * Get to the first byte.
X */
X bp += (r_off >> 3);
X r_off &= 7;
X
X /*
X * Since code is always >= 8 bits, only need to mask the first
X * hunk on the left.
X */
X *bp = (*bp&rmask[r_off]) | (code<>= (8 - r_off);
X
X /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
X if(bits >= 8)
X { *bp++ = code;
X code >>= 8;
X bits -= 8;
X }
X
X /* Last bits. */
X if(bits)
X *bp = code;
X
X offset += n_bits;
X
X if(offset == (n_bits << 3))
X { bp = buf;
X bits = n_bits;
X bytes_out += bits;
X do
X putc_pak(*bp++,t);
X while(--bits);
X offset = 0;
X }
X
X /*
X * If the next entry is going to be too big for the code size,
X * then increase it, if possible.
X */
X if(free_ent>maxcode || clear_flg>0)
X { /*
X * Write the whole buffer, because the input side won't
X * discover the size increase until after it has read it.
X */
X if(offset > 0)
X { bp = buf; /* reset pointer for writing */
X bytes_out += n = n_bits;
X while(n--)
X putc_pak(*bp++,t);
X }
X offset = 0;
X
X if(clear_flg) /* reset if clearing */
X { maxcode = MAXCODE(n_bits = INIT_BITS);
X clear_flg = 0;
X }
X else /* else use more bits */
X { n_bits++;
X if(n_bits == BITS)
X maxcode = maxcodemax;
X else
X maxcode = MAXCODE(n_bits);
X }
X }
X }
X
X else /* dump the buffer on EOF */
X { bytes_out += n = (offset+7) / 8;
X
X if(offset > 0)
X while(n--)
X putc_pak(*bp++,t);
X offset = 0;
X }
X}
X
X/*****************************************************************
X *
X * Read one code from the input file. If EOF, return -1.
X * Inputs:
X * f: the file to read from
X * Outputs:
X * code or -1 is returned.
X */
X
Xstatic INT getcode(f) /* get a code */
XFILE *f; /* file to get from */
X{
X INT code;
X static INT offset = 0, size = 0;
X INT r_off, bits;
X unsigned char *bp = buf;
X
X if(clear_flg > 0 || offset >= size || free_ent > maxcode)
X { /*
X * If the next entry will be too big for the current code
X * size, then we must increase the size. This implies reading
X * a new buffer full, too.
X */
X if(free_ent > maxcode)
X { n_bits++;
X if(n_bits == BITS)
X maxcode = maxcodemax; /* won't get any bigger now */
X else maxcode = MAXCODE(n_bits);
X }
X if(clear_flg > 0)
X { maxcode = MAXCODE(n_bits = INIT_BITS);
X clear_flg = 0;
X }
X
X for(size=0; size> 3);
X r_off &= 7;
X
X /* Get first part (low order bits) */
X code = (*bp++ >> r_off);
X bits -= 8 - r_off;
X r_off = 8 - r_off; /* now, offset into code word */
X
X /* Get any 8 bit parts in the middle (<=1 for up to 16 bits). */
X if(bits >= 8)
X { code |= *bp++ << r_off;
X r_off += 8;
X bits -= 8;
X }
X /* high order bits. */
X code |= (*bp & rmask[bits]) << r_off;
X offset += n_bits;
X
X return code;
X}
X
X/*
X * compress a file
X *
X * Algorithm: use open addressing double hashing (no chaining) on the
X * prefix code / next character combination. We do a variant of Knuth's
X * algorithm D (vol. 3, sec. 6.4) along with G. Knott's relatively-prime
X * secondary probe. Here, the modular division first probe gives way
X * to a faster exclusive-or manipulation. Also do block compression with
X * an adaptive reset, where the code table is cleared when the compression
X * ratio decreases, but after the table fills. The variable-length output
X * codes are re-sized at this point, and a special CLEAR code is generated
X * for the decompressor.
X */
X
XINT init_cm(f,t) /* initialize for compression */
XFILE *f; /* file we will be compressing */
XFILE *t; /* where we will put it */
X{
X offset = 0;
X bytes_out = 1;
X clear_flg = 0;
X ratio = 0;
X in_count = 1;
X checkpoint = CHECK_GAP;
X maxcode = MAXCODE(n_bits = INIT_BITS);
X free_ent = FIRST;
X setmem(htab,HSIZE*sizeof(long),0xff);
X n_bits = INIT_BITS; /* set starting code size */
X
X putc_pak(BITS,t); /* note our max code length */
X
X firstcmp = 1; /* next byte will be first */
X}
X
XINT putc_cm(c,t) /* compress a character */
Xunsigned char c; /* character to compress */
XFILE *t; /* where to put it */
X{
X static long fcode;
X static INT hshift;
X register INT i;
X register INT disp;
X
X if(firstcmp) /* special case for first byte */
X { ent = c; /* remember first byte */
X
X hshift = 0;
X for(fcode=(long)HSIZE; fcode<65536L; fcode*=2L)
X hshift++;
X hshift = 8 - hshift; /* set hash code range bund */
X
X firstcmp = 0; /* no longer first */
X return;
X }
X
X in_count++;
X fcode =(long)(((long)c << BITS)+ent);
X i = (c<