String compression in JavaScript

I'm looking for a JavaScript function that given a string returns a compressed (shorter) string.

I'm developing a Chrome web application that saves long strings (HTML) to a local database. For testing purposes I tried to zip the file storing the database, and it shrank by a factor of five, so I figured it would help keep the database smaller if I compressed the things it stores.

I've found an implementation of LZSS in JavaScript here: http://code.google.com/p/u-lzss/ ("U-LZSS").

It seemed to work when I tested it "by hand" with short example strings (decode === encode), and it's reasonably fast too, in Chrome. But when given big strings (100 ko) it seems to garble/mix up the last half of the string.

Is it possible that U-LZSS expects short strings and can't deal with larger strings? And would it be possible to adjust some parameters in order to move that upper limit?

I just released a small LZW implementation especially tailored for this very purpose as none of the existing implementations did meet my needs.

Here is the home page of the project
Here is a link to a demo comparing it with LZMA level 1

That's what I'm using going forward, and I will probably try to improve the library at some point.

Here are encode (276 bytes, function en) and decode (191 bytes, function de) functions I modded from LZW in a fully working demo. There is no smaller or faster routine available on the internet than what I am giving you here.

function en(c){var x='charCodeAt',b,e={},f=c.split(""),d=[],a=f[0],g=256;for(b=1;b<f.length;b++)c=f[b],null!=e[a+c]?a+=c:(d.push(1<a.length?e[a]:a[x](0)),e[a+c]=g,g++,a=c);d.push(1<a.length?e[a]:a[x](0));for(b=0;b<d.length;b++)d[b]=String.fromCharCode(d[b]);return d.join("")}

function de(b){var a,e={},d=b.split(""),c=f=d[0],g=[c],h=o=256;for(b=1;b<d.length;b++)a=d[b].charCodeAt(0),a=h>a?d[b]:e[a]?e[a]:f+c,g.push(a),c=a.charAt(0),e[o]=f+c,o++,f=a;return g.join("")}

var compressed=en("http://www.ScriptCompress.com - Simple Packer/Minify/Compress JavaScript Minify, Fixify & Prettify 75 JS Obfuscators In 1 App 25 JS Compressors (Gzip, Bzip, LZMA, etc) PHP, HTML & JS Packers In 1 App PHP Source Code Packers Text Packer HTML Packer or v2 or v3 or LZW Twitter Compress or More Words DNA & Base64 Packer (freq tool) or v2 JS JavaScript Code Golfer Encode Between Quotes Decode Almost Anything Password Protect Scripts HTML Minifier v2 or Encoder or Escaper CSS Minifier or Compressor v2 SVG Image Shrinker HTML To: SVG or SVGZ (Gzipped) HTML To: PNG or v2 2015 JS Packer v2 v3 Embedded File Generator Extreme Packer or version 2 Our Blog DemoScene JS Packer Basic JS Packer or New Version Asciify JavaScript Escape JavaScript Characters UnPacker Packed JS JavaScript Minify/Uglify Text Splitter/Chunker Twitter, Use More Characters Base64 Drag 'n Drop Redirect URL DataURI Get Words Repeated LZMA Archiver ZIP Read/Extract/Make BEAUTIFIER & CODE FIXER WHAK-A-SCRIPT JAVASCRIPT MANGLER 30 STRING ENCODERS CONVERTERS, ENCRYPTION & ENCODERS 43 Byte 1px GIF Generator Steganography PNG Generator WEB APPS VIA DATAURL OLD VERSION OF WHAK PAKr Fun Text Encrypt Our Google");
var decompressed=de(compressed);

document.writeln('<hr>'+compressed+'<hr><h1>'+compressed.length+' characters versus original '+decompressed.length+' characters.</h1><hr>'+decompressed+'<hr>');

It seems, there is a proposal of compression/decompression API: https://github.com/wicg/compression/blob/master/explainer.md .

And it is implemented in Chrome 80 (right now in Beta) according to a blog post at https://blog.chromium.org/2019/12/chrome-80-content-indexing-es-modules.html .

I am not sure I am doing a good conversion between streams and strings, but here is my try to use the new API:

function compress(string, encoding) {
  const byteArray = new TextEncoder().encode(string);
  const cs = new CompressionStream(encoding);
  const writer = cs.writable.getWriter();
  writer.write(byteArray);
  writer.close();
  return new Response(cs.readable).arrayBuffer();
}

function decompress(byteArray, encoding) {
  const cs = new DecompressionStream(encoding);
  const writer = cs.writable.getWriter();
  writer.write(byteArray);
  writer.close();
  return new Response(cs.readable).arrayBuffer().then(function (arrayBuffer) {
    return new TextDecoder().decode(arrayBuffer);
  });
}

const test = "http://www.ScriptCompress.com - Simple Packer/Minify/Compress JavaScript Minify, Fixify & Prettify 75 JS Obfuscators In 1 App 25 JS Compressors (Gzip, Bzip, LZMA, etc) PHP, HTML & JS Packers In 1 App PHP Source Code Packers Text Packer HTML Packer or v2 or v3 or LZW Twitter Compress or More Words DNA & Base64 Packer (freq tool) or v2 JS JavaScript Code Golfer Encode Between Quotes Decode Almost Anything Password Protect Scripts HTML Minifier v2 or Encoder or Escaper CSS Minifier or Compressor v2 SVG Image Shrinker HTML To: SVG or SVGZ (Gzipped) HTML To: PNG or v2 2015 JS Packer v2 v3 Embedded File Generator Extreme Packer or version 2 Our Blog DemoScene JS Packer Basic JS Packer or New Version Asciify JavaScript Escape JavaScript Characters UnPacker Packed JS JavaScript Minify/Uglify Text Splitter/Chunker Twitter, Use More Characters Base64 Drag 'n Drop Redirect URL DataURI Get Words Repeated LZMA Archiver ZIP Read/Extract/Make BEAUTIFIER & CODE FIXER WHAK-A-SCRIPT JAVASCRIPT MANGLER 30 STRING ENCODERS CONVERTERS, ENCRYPTION & ENCODERS 43 Byte 1px GIF Generator Steganography PNG Generator WEB APPS VIA DATAURL OLD VERSION OF WHAK PAKr Fun Text Encrypt Our Google";

async function testCompression(text, encoding = 'deflate') {
  console.log(encoding + ':');
  console.time('compress');
  const compressedData = await compress(text, encoding);
  console.timeEnd('compress');
  console.log('compressed length:', compressedData.byteLength, 'bytes');
  console.time('decompress');
  const decompressedText = await decompress(compressedData, encoding);
  console.timeEnd('decompress');
  console.log('decompressed length:', decompressedText.length, 'characters');
  console.assert(text === decompressedText);
}

(async function () {
  await testCompression(test, 'deflate');
  await testCompression(test, 'gzip');
}());

To me it doesn't seem reasonable to compress a string using UTF-8 as the destination... It looks like just looking for trouble. I think it would be better to lose some compression and using plain 7-bit ASCII as the destination if over-the-wire size is important.

If the storage limit is based on UTF-16 characters then a large safe subset could be looked for if you care about escaping or UTF-16 compliance or you could just try to use each char as 0..65535 if everything else involved (e.g. databases) don't have problems. Most software layers should have no problems with that (ab)use but note that in UTF-16 range 0xD800-0xDFFF is reserved for a special use (surrogate pairs) so some combinations are formally "encoding errors" and could in theory be stopped or distorted.

In a toy 4 KB JavaScript demo I wrote for fun I used an encoding for the result of compression that stores four binary bytes into five chars chosen from a subset of ASCII of 85 chars that is clean for embedding in a JavaScript string (85^5 is slightly more than 8^4, but still fits in the precision of JavaScript integers). This makes compressed data safe for example for JSON without need of any escaping.

In code the following builds the list of 85 "safe" characters:

let cset = "";
for (let i=35; i<35+85+1; i++) {
    if (i !== 92) cset += String.fromCharCode(i);
}

Then to encode 4 bytes (b0, b1, b2 and b3 each from 0...255) into 5 characters the code is:

// First convert to 0...4294967295
let x = ((b0*256 + b1)*256 + b2)*256 + b3;

// Then convert to base 85
let result = "";
for (let i=0; i<5; i++) {
    let x2 = Math.floor(x / 85);
    result += cset[x - x2*85];
    x = x2;
}

To decode you do the reverse, i.e. compute x from the base-85 number and then extract the 4 base-256 digits (i.e. the bytes).

NOTE: in the torus code I used a slightly different charset, instead of skipping 92 \ I replaced it with 126 ~. For who is interested the full decompression code is

// There are two Huffman-encoded code streams
//    T - single chars (0..127) and sequence lengths (128...255)
//    A - high bits of relative addresses of sequence (0..255)
//
// Expansion algorithm is:
//    1) Read a code X from T
//    2) If it's a char (X < 128) then add to output
//    3) otherwise (X>=128) read sequence address ADDR from stream A (high bits)
//       and from input (low bits) and copy X-128 bytes from ADDR bytes "ago"
//

let Z = 5831; // expanded size
let i = 0, // source ptr
    a = 0, // current bits accumulator
    n = 0; // number of available bits in a

// Read a single bit
let b = function(){
    if (!n) {
        // There are no more bits available in the accumulator, read a new chunk:
        // 5 ASCII escape-safe chars will be transformed in 4 8-bit binary bytes
        // (like BASE64, just a bit more dense)
        a = 0;
        let w = 5;
        while (w--) {
            let y = s.charCodeAt(i+w);          // get next char
            a = a*85 + (y > 125 ? 92 : y) - 35; // extract base-85 "digit" (note, uses ~ instead of \ that requires quoting)
        }
        n = 32; // we got 32 bits in a
        i += 5; // we consumed 5 characters from source
    }
    return (a >> --n) & 1;  // extract a single bit
};

// Read a code of z bits by concatenating bits coming from b()
let v = function(z){
    return (--z ? v(z) : 0)*2+b();
};

// Read an Huffman (sub-)tree: a bit will tell if we need to
// read a two sub-trees or a leaf
let h = function(){
    return b() ? [h(), h()] : v(8);
};

// Read A and T Huffman trees
let A = h(), T = h();

// Extract a code given a node:
//   if the node is an array (intermediate node) then we need to read a bit
//   from the input binary stream to decide which way to go down the tree,
//   if it's a number then we just return the value.
//   `n.map` is truthy for arrays and falsy for numbers.
let d = function(n){
    return n.map ? d(n[b()]) : n;
};

let S="";  // Output

// While we're not done
while(S.length<Z){
    // Extract a code from T
    x = d(T);
    if (x < 128) {
        // This is a single character, copy to output
        S += String.fromCharCode(x);
    } else {
        // This is a sequence of x-128 bytes, get address and copy it
        // Note: high 8 bits are from the Huffman tree A and 8 low bits
        // are instead directly form the bit stream as they're basically
        // noise and there's nothing to gain by trying to compress them.
        S += S.substr(S.length-(d(A)<<8)-v(8), x-128)
    };
}

(note that I dind't test this reformatted/commented version, typos may be present)

String compression in JavaScript

Related

Recent Posts