/* * l z c m p 2 . c * * Actually do compression. */ #include "lz.h" static int offset; long int in_count = 1; /* length of input */ long int bytes_out; /* length of compressed output */ long int out_count = 0; /* # of codes output (debug) */ flag clear_flg = FALSE; /* * These global parameters are written to the compressed file. * The decompressor needs them. */ extern short maxbits; /* settable max # bits/code */ extern short block_compress; extern code_int maxmaxcode; /* * Set by the mainline code and/or static. */ /* * Local variables to the compressor. */ code_int maxcode; int n_bits; double ratio; count_int checkpoint = CHECK_GAP; #ifdef decus /* * dsect is a Decus C specific command that tells the linker/task-builder * to put the following data in a specific program section. It is * not necessary, but lets me see how much space various HSIZE etc. * options require. */ dsect "xxhtab"; #endif count_int htab[HSIZE]; U_short codetab[HSIZE]; extern code_int hsize; code_int free_ent = 0; flag using_fast; /* Not using hashing */ #ifdef DEBUG code_int largecode = 0; int largebits = 0; int clearcount = 0; #endif #if IS_CACHE /* * Define parameters for the "character hog" check. This optimizes * hash table lookup for the "most popular" character. This improves * performance for raster images at a modest increase in memory. * (which real small machines can't afford). */ #ifdef decus dsect "xxcach"; #endif #define HOG_CHECK ((count_int) 2000) /* Number to read before check */ #define MAX_CACHE ((count_int) 1< 0 /* * Machines with LOTS of memory don't bother to hash the codes. */ short ftable[ (1 << FBITS) * 256]; count_int fcodemem[1 << FBITS]; extern long fsize; /* File size */ #endif /* * compress stdin to stdout * * Algorithm: on large machines, for maxbits <= FBITS, use fast direct table * lookup on the prefix code / next character combination. For smaller code * size, use open addressing modular division double hashing (no chaining), ala * Knuth vol. 3, sec. 6.4 Algorithm D, along with G. Knott's relatively-prime * secondary probe. Do block compression with an adaptive reset, whereby the * code table is cleared when the compression ratio decreases, but after the * table fills. The variable-length output codes are re-sized at this point, * and a special CLEAR code is generated for the decompressor. For the * megamemory version, the sparse array is cleared indirectly through a * "shadow" output code history. Late additions: for the hashing code, * construct the table according to file size for noticeable speed improvement * on small files. Also detect and cache codes associated with the most * common character to bypass hash calculation on these codes (a characteristic * of highly-compressable raster images). Please direct questions about this * implementation to ames!jaw. */ int compress() /* * Compression mainline. TRUE if successful. */ { register long fcode; register code_int i; register int c; /* Current input char */ register code_int ent; register int disp; register code_int hsize_reg; code_int initial_probe; /* Hash full tester */ i = 0; offset = 0; bytes_out = 0; out_count = 0; clear_flg = FALSE; using_fast = FALSE; /* Not using direct addressing */ ratio = 0.0; in_count = 1; checkpoint = CHECK_GAP; n_bits = INIT_BITS; maxcode = MAXCODE(n_bits); free_ent = ((block_compress) ? FIRST : 256); #ifdef DEBUG largecode = free_ent; largebits = n_bits; clearcount = 0; #endif ent = GET(); #if USERMEM > 0 /* * Maybe use direct addressing for big files and huge systems. */ if (maxbits <= FBITS && (fsize >= 30000)) { using_fast = TRUE; while ((c = GET()) != EOF) { in_count++; fcode = (long) (((long) c << maxbits) + ent); /* * test for code in "string" table */ if (ftable[fcode] != 0) ent = ftable[fcode]; else { output ((code_int) ent); out_count++; ent = c; if (free_ent >= maxmaxcode) { if ((count_int) in_count < checkpoint || block_compress == 0) continue; else { clear(); i = 0; } } else { /* * put code in table and * memorize for block compression. */ ftable[fcode] = (short) free_ent++; fcodemem[i++] = fcode; #ifdef DEBUG if (free_ent > largecode) largecode = free_ent; #endif } } } goto finish; } /* * Strictly speaking, the following is an "else" clause. */ #endif #if IS_CACHE chog = CHOG; /* assumed character for the hog */ cstat_flg = FALSE; #endif hsize_reg = hsize; cl_hash(hsize_reg); /* clear hash tables */ while ((c = GET()) != (unsigned) EOF) { in_count++; #if IS_CACHE if (!cstat_flg) { cfreq[c]++; /* gather frequencies at start of input */ if ((count_int) in_count > HOG_CHECK) { cstat_flg = TRUE; chog = hogtally(); /* compute char hog */ if (chog != CHOG) /* fixup for wrong assumption */ creset((count_int) free_ent); } } if (c == chog) { if ((i = hashcache[ent])) { /* cache -> code */ ent = i; continue; } } #endif fcode = (long) (((long) c << maxbits) + ent); #ifdef SHORT_INT /* * Avoid long modulus */ i = (((c + 12347) * ent) & 077777) % HSIZE; #else i = fcode % hsize_reg; /* division hashing */ #endif if (htab[i] == fcode) { ent = codetab[i]; continue; } else if ((long) htab[i] < 0) /* empty slot */ goto nomatch; disp = hsize_reg - i; /* secondary hash (G. Knott) */ if (i == 0) disp = 1; initial_probe = i; /* Remember for full test */ probe: if ((i -= disp) < 0) i += hsize_reg; if (htab[i] == fcode) { ent = codetab[i]; continue; } if ((long) htab[i] > 0) { if (i == initial_probe) { fprintf(stderr, "Hash table full, get help\n"); abort(); } goto probe; } nomatch: output((code_int) ent); out_count++; #ifdef interdata if ((unsigned) free_ent < (unsigned) maxmaxcode) { #else if (free_ent < maxmaxcode) { #endif #if IS_CACHE if (c == chog) /* code -> cache */ hashcache[ent] = free_ent; #endif codetab[i] = free_ent++; /* code -> hashtable */ #ifdef DEBUG if (free_ent > largecode) largecode = free_ent; #endif htab[i] = fcode; } else if ((count_int) in_count >= checkpoint && block_compress != 0) { clear(); } ent = c; } finish: /* * Put out the final code. */ output((code_int) ent); out_count++; if (block_compress != 0) { /* * On RT11 and similar operating systems, the last * physical data block is null-filled to some defined * size. Code in lzdcm2.c looks for two CLEAR's in * a row and exits if seen. The -1 following may also * be an exit code. */ sendclear(); sendclear(); } output((code_int) -1); /* EOF signal */ /* * Print out stats on stderr */ if (!quiet) { #ifdef DEBUG fprintf(stderr, "%ld chars in, %ld codes (%ld bytes) out, ", in_count, out_count, bytes_out); divout("compression ratio", in_count, bytes_out, "\n"); divout(" Compression as in compact", (in_count - bytes_out) * 100, in_count, "%\n"); fprintf(stderr, " Largest code was %d (%d bits),", largecode - 1, largebits); fprintf(stderr, " %d clear code%s sent.\n", clearcount, (clearcount == 1) ? "" : "s"); #else divout("Compression", (in_count - bytes_out) * 100, in_count, "%\n"); #endif } if (bytes_out > in_count) { fprintf(stderr, "Note: output file (%ld bytes) > input (%ld bytes)\n", bytes_out, in_count); return (FALSE); /* Tell the caller */ } return (TRUE); } static divout(leader, numer, denom, trailer) char *leader; long numer; long denom; char *trailer; /* * Print numer / denom without loading floating printout routines. */ { numer = (numer * 100) / denom; fprintf(stderr, "%s: %ld.%02ld%s", leader, numer / 100, numer % 100, trailer); } /* * Output a given code. */ static char_type buf[BITS]; #if !vax_asm char_type lmask[9] = {0xFF, 0xFE, 0xFC, 0xF8, 0xF0, 0xE0, 0xC0, 0x80, 0x00}; char_type rmask[9] = {0x00, 0x01, 0x03, 0x07, 0x0F, 0x1F, 0x3F, 0x7F, 0xFF}; #endif #ifdef DEBUG static int col = 0; #endif output(code) code_int code; /* * Output the given code. * Inputs: * code: A n_bits-bit integer. If == -1, then EOF. This assumes * that n_bits =< (long)wordsize - 1. * Outputs: * Outputs code to the file. * Assumptions: * Chars are 8 bits long. * Algorithm: * Maintain a BITS character long buffer (so that 8 codes will * fit in it exactly). Use the VAX insv instruction to insert each * code in turn. When the buffer fills up empty it and start over. */ { /* * On the VAX (Unix), it is important to have the register declarations * in exactly the order given, or the asm will break. */ register int r_off, bits; register char_type *bp; r_off = offset; bits = n_bits; bp = buf; if (code >= 0) { /* * Not at EOF, add the code */ #ifdef DEBUG if (debug > 1) { fprintf(stderr, "%5d%c", code, (col += 6) >= 74 ? (col = 0, '\n') : ' '); } #endif #if vax_asm /* * VAX DEPENDENT!! Implementation on other machines may be * difficult. * * Translation: Insert BITS bits from the argument starting at * offset bits from the beginning of buf. */ 0; /* C compiler bug ?? */ asm("insv 4(ap),r11,r10,(r9)"); #else /* * WARNING: byte/bit numbering on the vax is simulated * by the following code * * Get to the first byte. */ bp += (r_off >> 3); r_off &= 7; /* * Since code is always >= 8 bits, only need to mask the first * hunk on the left. */ *bp = (*bp & rmask[r_off]) | (code << r_off) & lmask[r_off]; bp++; bits -= (8 - r_off); code >>= 8 - r_off; /* * Get any 8 bit parts in the middle ( <= 1 for up to 16 bits). */ if (bits >= 8) { *bp++ = code; code >>= 8; bits -= 8; } if (bits != 0) /* Last bits. */ *bp = code; #endif offset += n_bits; if (offset == (n_bits << 3)) { bytes_out += n_bits; PUTBUF(buf, n_bits); offset = 0; } /* * If the next entry is going to be too big for the code size, * then increase it, if possible. */ if (free_ent > maxcode || clear_flg) { /* * Write the whole buffer, because the input side won't * discover the size increase until after it has read it. */ if (offset > 0) { PUTBUF(buf, n_bits); bytes_out += n_bits; } offset = 0; if (clear_flg) { n_bits = INIT_BITS; maxcode = MAXCODE(n_bits); clear_flg = FALSE; } else { n_bits++; if (n_bits == maxbits) maxcode = maxmaxcode; else maxcode = MAXCODE(n_bits); #ifdef DEBUG if (n_bits > largebits) largebits = n_bits; #endif } #ifdef DEBUG if (debug != 0) { fprintf(stderr, "\nChange to %d bits\n", n_bits); col = 0; } #endif } } else { /* * At EOF, write the rest of the buffer. */ if (offset > 0) { PUTBUF(buf, ((offset + 7) / 8)); bytes_out += (offset + 7) / 8; } offset = 0; LZ_FLUSH(); /* Flush output buffer */ #ifdef DEBUG if (debug > 1) fprintf(stderr, "\n"); #endif } } clear() /* * Check the compression ration to see whether it is going up * or staying the same. If it is going down, the internal * statistics of the file have changed, so clear out our * tables and start over. Inform the decompressor of the * change by sending out a CLEAR code. */ { register code_int i; register count_int *p, *endp; register U_short *q; #ifdef DEBUG if (debug != 0) { fprintf (stderr, "clear: count: %ld", in_count); divout(" ratio", in_count, bytes_out, "\n"); } #endif checkpoint = in_count + CHECK_GAP; if ((double) in_count / (double) bytes_out > ratio) ratio = (double) in_count / (double) bytes_out; else { /* * Compression ratio has gone down, start again. * The most likely reason for this is that the * characteristics of the file have changed. */ ratio = 0.0; #if USERMEM > 0 if (using_fast) { /* sparse array clear */ for (i = (1 << maxbits) - 1; i >= 0; i--) ftable[fcodemem[i]] = 0; /* indirect thru "shadow" */ } else #endif { /* * Clear hash table */ endp = &htab[hsize]; for (p = &htab[0], q = &codetab[0]; p < endp;) { *p++ = -1; *q++ = 0; } #if IS_CACHE creset(MAX_CACHE); #endif } sendclear(); #ifdef DEBUG clearcount++; #endif } } sendclear() /* * Put a clear code into the output stream. */ { free_ent = FIRST; clear_flg = TRUE; output((code_int) CLEAR); out_count++; } #if IS_CACHE static flag nfiles = FALSE; creset(n) register count_int n; /* clear at least this many entries */ /* * clear hash cache */ { register count_int i; register U_short *hash_p; register U_short zero; if (!nfiles) { /* No clear needed if first time */ nfiles = TRUE; return; } zero = 0; n = (n + 15) & (~15); /* Was & (-16) */ hash_p = hashcache + n; for (i = n; i > 0; i -=16) { *(hash_p-16) = zero; *(hash_p-15) = zero; *(hash_p-14) = zero; *(hash_p-13) = zero; *(hash_p-12) = zero; *(hash_p-11) = zero; *(hash_p-10) = zero; *(hash_p-9) = zero; *(hash_p-8) = zero; *(hash_p-7) = zero; *(hash_p-6) = zero; *(hash_p-5) = zero; *(hash_p-4) = zero; *(hash_p-3) = zero; *(hash_p-2) = zero; *(hash_p-1) = zero; hash_p -= 16; } } hogtally() /* compute character code hog */ { register int i, most; for (i = most = 0; i < 256; i++) { if (cfreq[i] >= cfreq[most]) most = i; } return (most); } #endif cl_hash(h_size) register code_int h_size; { register count_int *htab_p; register int i; register long m1; htab_p = htab + h_size; m1 = -1; #if IS_CACHE /* clear hashcache */ #define min(a,b) ((a > b) ? b : a) creset(min((count_int) h_size, MAX_CACHE)); #endif i = h_size - 16; do { *(htab_p-16) = m1; *(htab_p-15) = m1; *(htab_p-14) = m1; *(htab_p-13) = m1; *(htab_p-12) = m1; *(htab_p-11) = m1; *(htab_p-10) = m1; *(htab_p-9) = m1; *(htab_p-8) = m1; *(htab_p-7) = m1; *(htab_p-6) = m1; *(htab_p-5) = m1; *(htab_p-4) = m1; *(htab_p-3) = m1; *(htab_p-2) = m1; *(htab_p-1) = m1; htab_p -= 16; } while ((i -= 16) >= 0); for (i += 16; i > 0; i--) *--htab_p = m1; }