From 4105074e213e9d2a9bd78a3009706b282f139b7f Mon Sep 17 00:00:00 2001 From: David Junger Date: Mon, 24 May 2021 11:41:20 +0200 Subject: [PATCH 1/3] pre-compute 8kB table and implement slice-by-4 --- src/crc32.ts | 2 +- src/crc32.wat | 72 +++++++++++++++++++++++++++++++++++++++++++--- test/crc32.test.ts | 5 ++-- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/src/crc32.ts b/src/crc32.ts index 7bdb6bb..3612fdf 100644 --- a/src/crc32.ts +++ b/src/crc32.ts @@ -1,6 +1,6 @@ import { makeUint8Array } from "./utils.ts" -const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCpUBAkkBA38DQCABIQBBACECA0AgAEEBdiAAQQFxQaCG4u1+bHMhACACQQFqIgJBCEcNAAsgAUECdCAANgIAIAFBAWoiAUGAAkcNAAsLSQEBfyABQX9zIQFBgIAEIQJBgIAEIABqIQADQCABQf8BcSACLQAAc0ECdCgCACABQQh2cyEBIAJBAWoiAiAASQ0ACyABQX9zuAs" +const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCt0CApgBAQN/A0AgASEAQQAhAgNAIABBAXYgAEEBcUGghuLtfmxzIQAgAkEBaiICQQhHDQALIAFBAnQgADYCACABQQFqIgFBgAJHDQALQQAhAQNAQQAhAgNAIAIgAXIoAgAiAEH/AXFBAnQoAgAgAEEIdnMhACACQYAIaiICIAFyIAA2AgAgAkGAOEcNAAsgAUEEaiIBQYAIRw0ACwvAAQECfyABQX9zIQFBgIAEIQJBgIAEIABqIgBBAnVBAnQiA0GEgARPBEADQCABIAIoAgBzIQEgAUEYdkECdCgCACABQRB2Qf8BcUECdEGACHIoAgBzIAFBCHZB/wFxQQJ0QYAQcigCAHMgAUH/AXFBAnRBgBhyKAIAcyEBIAJBBGoiAiADRw0ACwsgAiAASQRAA0AgAUH/AXEgAi0AAHNBAnQoAgAgAUEIdnMhASACQQFqIgIgAEkNAAsLIAFBf3O4Cw" const instance = new WebAssembly.Instance( new WebAssembly.Module(Uint8Array.from(atob(wasm), c => c.charCodeAt(0))) diff --git a/src/crc32.wat b/src/crc32.wat index 78e7593..ebeed39 100644 --- a/src/crc32.wat +++ b/src/crc32.wat @@ -20,13 +20,77 @@ (local.tee $i (i32.add (local.get $i) (i32.const 1))) (br_if 0 (i32.ne (i32.const 0x100))) ) + (local.set $i (i32.const 0)) + (loop ;; now compute 7 more tables for slice-by-8 + (local.set $j (i32.const 0)) + (loop + (i32.load (i32.or (local.get $j) (local.get $i))) + local.tee $crc + + (i32.and (i32.const 0xFF)) + (i32.shl (i32.const 2)) + i32.load + + (i32.shr_u (local.get $crc) 
(i32.const 8)) + (local.set $crc (i32.xor)) + + (local.tee $j (i32.add (local.get $j) (i32.const 0x400))) ;; j++ + (i32.or (local.get $i)) + + local.get $crc + i32.store + (br_if 0 (i32.ne (local.get $j) (i32.const 0x1C00))) + ) + (local.tee $i (i32.add (local.get $i) (i32.const 4))) + (br_if 0 (i32.ne (i32.const 0x400))) + ) ;; in total, the tables occupy the first 8 kB of the second mem page ) ;; this computes the CRC32 of what you put in the module's second page of Memory ;; (do not overwrite the first page!) - (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) + (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) (local.set $crc (i32.xor (local.get $crc) (i32.const -1))) (local.set $i (i32.const 0x10000)) - (local.set $len (i32.add (i32.const 0x10000) (local.get $len))) + (local.tee $len (i32.add (i32.const 0x10000) (local.get $len))) + i32.const 2 + i32.shr_s + i32.const 2 + i32.shl + local.tee $l8 + + (if (i32.ge_u (i32.const 0x10004)) + (loop + (local.set $crc (i32.xor (local.get $crc) (i32.load (local.get $i)))) + + (i32.shr_u (local.get $crc) (i32.const 24)) + (i32.shl (i32.const 2)) + i32.load + + (i32.shr_u (local.get $crc) (i32.const 16)) + (i32.and (i32.const 0xFF)) + (i32.shl (i32.const 2)) + (i32.or (i32.const 0x0400)) + i32.load + i32.xor + + (i32.shr_u (local.get $crc) (i32.const 8)) + (i32.and (i32.const 0xFF)) + (i32.shl (i32.const 2)) + (i32.or (i32.const 0x0800)) + i32.load + i32.xor + + (i32.and (local.get $crc) (i32.const 0xFF)) + (i32.shl (i32.const 2)) + (i32.or (i32.const 0x0C00)) + i32.load + i32.xor + + local.set $crc ;; store the updated CRC + (local.tee $i (i32.add (local.get $i) (i32.const 4))) + (br_if 0 (i32.ne (local.get $l8))) + )) + + (if (i32.lt_u (local.get $i) (local.get $len)) (loop (i32.and (local.get $crc) (i32.const 0xFF)) (i32.load8_u (local.get $i)) @@ -38,10 +102,10 @@ local.set $crc ;; store the updated CRC (local.tee $i (i32.add (local.get $i) (i32.const 
1))) (br_if 0 (i32.lt_u (local.get $len))) - ) + )) (i32.xor (local.get $crc) (i32.const -1)) f64.convert_i32_u ;; return a positive Number ) (export "t" (func $genTable)) (export "c" (func $crc32)) -) \ No newline at end of file +) diff --git a/test/crc32.test.ts b/test/crc32.test.ts index d28c4e1..3d68ba3 100644 --- a/test/crc32.test.ts +++ b/test/crc32.test.ts @@ -4,9 +4,8 @@ import { crc32, memory } from "../src/crc32.ts" const table = await Deno.readFile("./test/table.array") Deno.test("the CRC32 module precomputes CRCs for each byte using the polynomial 0xEDB88320", () => { - const actual = new Uint8Array(memory.buffer.slice(0, 0x0400)) - const expected = table.slice(0, 0x400) - assertEquals(actual, expected) + const actual = new Uint8Array(memory.buffer.slice(0, 0x2000)) + assertEquals(actual, table) }) Deno.test("the CRC32 for an empty file", () => { From 9e9a0df3a9685c0b98d2af86df66803e3fe52824 Mon Sep 17 00:00:00 2001 From: David Junger Date: Mon, 24 May 2021 13:16:59 +0200 Subject: [PATCH 2/3] use SIMD --- src/crc32.ts | 2 +- src/crc32.wat | 42 ++++++++++++++++++++---------------------- test/crc32.test.ts | 5 +++-- 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/crc32.ts b/src/crc32.ts index 3612fdf..c8a201a 100644 --- a/src/crc32.ts +++ b/src/crc32.ts @@ -1,6 +1,6 @@ import { makeUint8Array } from "./utils.ts" -const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCt0CApgBAQN/A0AgASEAQQAhAgNAIABBAXYgAEEBcUGghuLtfmxzIQAgAkEBaiICQQhHDQALIAFBAnQgADYCACABQQFqIgFBgAJHDQALQQAhAQNAQQAhAgNAIAIgAXIoAgAiAEH/AXFBAnQoAgAgAEEIdnMhACACQYAIaiICIAFyIAA2AgAgAkGAOEcNAAsgAUEEaiIBQYAIRw0ACwvAAQECfyABQX9zIQFBgIAEIQJBgIAEIABqIgBBAnVBAnQiA0GEgARPBEADQCABIAIoAgBzIQEgAUEYdkECdCgCACABQRB2Qf8BcUECdEGACHIoAgBzIAFBCHZB/wFxQQJ0QYAQcigCAHMgAUH/AXFBAnRBgBhyKAIAcyEBIAJBBGoiAiADRw0ACwsgAiAASQRAA0AgAUH/AXEgAi0AAHNBAnQoAgAgAUEIdnMhASACQQFqIgIgAEkNAAsLIAFBf3O4Cw" +const wasm = 
"AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCusCApgBAQN/A0AgASEAQQAhAgNAIABBAXYgAEEBcUGghuLtfmxzIQAgAkEBaiICQQhHDQALIAFBAnQgADYCACABQQFqIgFBgAJHDQALQQAhAQNAQQAhAgNAIAIgAXIoAgAiAEH/AXFBAnQoAgAgAEEIdnMhACACQYAIaiICIAFyIAA2AgAgAkGAOEcNAAsgAUEEaiIBQYAIRw0ACwvOAQICfwF7IAFBf3MhAUGAgAQhAkGAgAQgAGoiAEECdUECdCIDQYSABE8EQANAIAEgAigCAHP9Ef0MA////wL///8B////AP////0OQQL9qwH9DAAAAAAABAAAAAgAAAAMAAD9UCIE/RsAKAIAIAT9GwEoAgBzIAT9GwIoAgBzIAT9GwMoAgBzIQEgAkEEaiICIANHDQALCyACIABJBEADQCABQf8BcSACLQAAc0ECdCgCACABQQh2cyEBIAJBAWoiAiAASQ0ACwsgAUF/c7gL" const instance = new WebAssembly.Instance( new WebAssembly.Module(Uint8Array.from(atob(wasm), c => c.charCodeAt(0))) diff --git a/src/crc32.wat b/src/crc32.wat index ebeed39..57b7ad1 100644 --- a/src/crc32.wat +++ b/src/crc32.wat @@ -21,7 +21,7 @@ (br_if 0 (i32.ne (i32.const 0x100))) ) (local.set $i (i32.const 0)) - (loop ;; now compute 7 more tables for slice-by-8 + (loop ;; now compute 3 more tables for slice-by-4 (local.set $j (i32.const 0)) (loop (i32.load (i32.or (local.get $j) (local.get $i))) @@ -39,15 +39,15 @@ local.get $crc i32.store - (br_if 0 (i32.ne (local.get $j) (i32.const 0x1C00))) + (br_if 0 (i32.ne (local.get $j) (i32.const 0xC00))) ) (local.tee $i (i32.add (local.get $i) (i32.const 4))) (br_if 0 (i32.ne (i32.const 0x400))) - ) ;; in total, the tables occupy the first 8 kB of the second mem page + ) ;; in total, the tables occupy the first 4 kB of the first mem page ) ;; this computes the CRC32 of what you put in the module's second page of Memory ;; (do not overwrite the first page!) 
- (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) + (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) (local $v v128) (local.set $crc (i32.xor (local.get $crc) (i32.const -1))) (local.set $i (i32.const 0x10000)) (local.tee $len (i32.add (i32.const 0x10000) (local.get $len))) @@ -59,30 +59,28 @@ (if (i32.ge_u (i32.const 0x10004)) (loop - (local.set $crc (i32.xor (local.get $crc) (i32.load (local.get $i)))) + (i32.xor (local.get $crc) (i32.load (local.get $i))) - (i32.shr_u (local.get $crc) (i32.const 24)) - (i32.shl (i32.const 2)) - i32.load + i32x4.splat + v128.const i32x4 0xFFFFFF03 0xFFFFFF02 0xFFFFFF01 0xFFFFFF00 + i8x16.swizzle ;; this was called 'v8x16.swizzle' if you have an older WABT - (i32.shr_u (local.get $crc) (i32.const 16)) - (i32.and (i32.const 0xFF)) - (i32.shl (i32.const 2)) - (i32.or (i32.const 0x0400)) - i32.load + (i32x4.shl (i32.const 2)) + v128.const i32x4 0x00000000 0x00000400 0x00000800 0x00000C00 + v128.or + local.tee $v + (i32.load (i32x4.extract_lane 0)) + + local.get $v + (i32.load (i32x4.extract_lane 1)) i32.xor - (i32.shr_u (local.get $crc) (i32.const 8)) - (i32.and (i32.const 0xFF)) - (i32.shl (i32.const 2)) - (i32.or (i32.const 0x0800)) - i32.load + local.get $v + (i32.load (i32x4.extract_lane 2)) i32.xor - (i32.and (local.get $crc) (i32.const 0xFF)) - (i32.shl (i32.const 2)) - (i32.or (i32.const 0x0C00)) - i32.load + local.get $v + (i32.load (i32x4.extract_lane 3)) i32.xor local.set $crc ;; store the updated CRC diff --git a/test/crc32.test.ts b/test/crc32.test.ts index 3d68ba3..3b805c5 100644 --- a/test/crc32.test.ts +++ b/test/crc32.test.ts @@ -4,8 +4,9 @@ import { crc32, memory } from "../src/crc32.ts" const table = await Deno.readFile("./test/table.array") Deno.test("the CRC32 module precomputes CRCs for each byte using the polynomial 0xEDB88320", () => { - const actual = new Uint8Array(memory.buffer.slice(0, 0x2000)) - assertEquals(actual, table) 
+ const actual = new Uint8Array(memory.buffer).subarray(0, 0x1000) + const expected = table.subarray(0, 0x1000) + assertEquals(actual, expected) }) Deno.test("the CRC32 for an empty file", () => { From c00ff2b010606c5834cc229766a0588fb438ea74 Mon Sep 17 00:00:00 2001 From: David Junger Date: Sat, 10 Jul 2021 09:46:52 +0200 Subject: [PATCH 3/3] start with the basic implementation and load SIMD asynchronously --- README.md | 31 +++++++++++---- crc32x4.wasm | Bin 0 -> 338 bytes index.d.ts | 4 ++ src/crc32.ts | 25 ++++++++---- src/crc32.wat | 68 ++------------------------------- src/crc32x4.wat | 93 +++++++++++++++++++++++++++++++++++++++++++++ src/index.ts | 1 + src/utils.ts | 1 + src/worker.ts | 3 +- test/crc32.test.ts | 22 +++++++++-- 10 files changed, 164 insertions(+), 84 deletions(-) create mode 100644 crc32x4.wasm create mode 100644 src/crc32x4.wat diff --git a/README.md b/README.md index 0323232..14e170c 100644 --- a/README.md +++ b/README.md @@ -61,7 +61,7 @@ When necessary, client-zip will generate Zip64 archives. Those are not readable # Usage -The module exports (and the worker script globally defines) a single function: +The module exports (and the worker script globally defines) this function: ```typescript function downloadZip(files: ForAwaitable): Response ``` @@ -78,6 +78,21 @@ The function returns a `Response` immediately. You don't need to wait for the wh Unless your list of inputs is quite small, you should prefer generators (when zipping Files or other resources that are already available) and async generators (when zipping Responses so you can `fetch` them lazily, or other resources that are generated last-minute so you don't need to store them longer than necessary) to provide the inputs to `downloadZip`. +### Setting up vector processing for CRC32 + +There is a second export from the module: +```typescript +function useSimd(url?: string | URL): Promise +``` + +**Warning**: it may reject! 
You should catch errors, and log them in dev mode until you fix the URL or hosting configuration. + +When you call that function, it will feature test for SIMD instructions in WebAssembly, and if that succeeds, it will attempt to replace the CRC32 module with another one that's almost 30% faster (in Deno, at least), for all current and future uses of `downloadZip`. + +You may call `useSimd` at any point (but preferably early and only once). By default it will look for a file named "crc32x4.wasm" in the same location as `import.meta.url`, which works when loading the library from a CDN but not necessarily if you bundle the library code into your app or "vendor" script. That's why you can pass a URL argument to `useSimd` (it will fetch the WASM from there instead ; depending on your setup you might want to copy the file to some static storage, or point to a CDN). + +The IIFE worker script does *not* expose `useSimd` ; instead it calls it immediately with no argument. So if you host "worker.js", make sure "crc32x4.wasm" is served right next to it. + # Benchmarks I started this project because I wasn't impressed with what appeared to be the only other ZIP library for browsers, [JSZip](https://stuk.github.io/jszip/). The JSZip website acknowledges its performance limitations, but now we can actually quantify them. I later found other libraries, which I've included in the new benchmarks. @@ -96,20 +111,22 @@ The experiments were run about 10 times for each lib and each dataset with a few *For the baseline, I timed the `zip` process in my UNIX shell — clearly there is much room for improvement. -The files were served over HTTP/1.1 by nginx running on localhost, with cache enabled (not that it makes a difference). The overhead of HTTP (not network, just having to go through the layers) really shows in the dataset with 12k files. +The files were served over HTTP by nginx running on localhost, with cache enabled (not that it makes a difference). 
+ +It's interesting that Chrome performs so much worse than Safari with client-zip and conflux, the two libraries that rely on WHATWG Streams and (in my case) async iterables, whereas it shows better runtimes with fflate (slightly) and JSZip (by a lot, though it may be a fluke as I did not repeat the experiment), both of which use synchronous code with callbacks. Shame on you, Chrome. -It's interesting that Chrome performs so much worse than Safari with client-zip and conflux, the two libraries that rely on WHATWG Streams and (in my case) async iterables, whereas it shows better runtimes with fflate (slightly) and JSZip (by a lot, though it may be a fluke as I did not repeat the 2-minutes long experiment), both of which use synchronous code with callbacks. +Also of note, using the SIMD-enabled CRC32 implementation in Chrome did not improve the overall performance of client-zip, suggesting that Chrome creates a bottleneck somewhere else. -Finally, I tried to run the experiment with 12k small files in Chrome, but it didn't finish after a few minutes so I gave up. Perhaps something to do with an inefficient handling of HTTP requests (I did disable network logging and enable network cache, but saw no impovement). +Finally, I tried to run the experiment with 12k small files in Chrome, but it was extremely slow. Perhaps something to do with an inefficient handling of so many HTTP requests (I did disable network logging and enable cache, but saw no improvement). Memory usage for any amount of data (when streaming using a ServiceWorker, or, in my test case for Zip64, deno) will remain constant or close enough. My tests maxed out at 36.1 MB of RAM while processing nearly 6 GB. Now, comparing bundle size is clearly unfair because the others do a bunch of things that my library doesn't. 
Here you go anyway (sizes are shown in decimal kilobytes): -| | `client-zip`@2.0.0 | fflate@0.7.1 | conflux@3 | JSZip@3.6 | +| | `client-zip`@2.1.0 | fflate@0.7.1 | conflux@3 | JSZip@3.6 | |--------------------|-------------------:|-------------:|----------:|----------:| -| minified | 4.6 kB | 29 kB | 185 kB | 96 kB | -| minified + gzipped | 2.1 kB | 11 kB | 53 kB | 27 kB | +| minified | 5.0 kB | 29 kB | 185 kB | 96 kB | +| minified + gzipped | 2.5 kB | 11 kB | 53 kB | 27 kB | The datasets I used in the new tests are not public domain, but nothing sensitive either ; I can send them if you ask. diff --git a/crc32x4.wasm b/crc32x4.wasm new file mode 100644 index 0000000000000000000000000000000000000000..47b307b9ac31cbc54f3275f61dd569907a99a900 GIT binary patch literal 338 zcmXX>F;2ul5c7CfB9H@#4(XarDjHhqY=y2mo^XgRQVPgH15#4(5o$ibEBMhWcmjpB zGhUBpjE8YB0CvU$aNsoAJ1eun27)0Fc5+HPO7SSsC%LyUg)XCE=n|7XUlT3zVo#Yz z)d{(J9UCO(x$0Y2B;1Nue|2|`I$4f=Yx?!B{xNd0H?h;$h@E32N=Rwv4-}NtD|3jDn literal 0 HcmV?d00001 diff --git a/index.d.ts b/index.d.ts index 25a89ff..ed10c67 100644 --- a/index.d.ts +++ b/index.d.ts @@ -11,3 +11,7 @@ type InputWithoutMeta = { input: BufferLike | StreamLike, name: any, lastModifie type ForAwaitable = AsyncIterable | Iterable export declare function downloadZip(files: ForAwaitable): Response + +/** Load the SIMD-enabled CRC32 module for improved performance. 
+ * @param url the location of the crc32x4.wasm file ; by default, it should be next to the client-zip index */ +export declare function useSimd(url?: string | URL): Promise diff --git a/src/crc32.ts b/src/crc32.ts index c8a201a..cf3f600 100644 --- a/src/crc32.ts +++ b/src/crc32.ts @@ -1,12 +1,9 @@ -import { makeUint8Array } from "./utils.ts" +import { makeUint8Array, parseBase64 } from "./utils.ts" -const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCusCApgBAQN/A0AgASEAQQAhAgNAIABBAXYgAEEBcUGghuLtfmxzIQAgAkEBaiICQQhHDQALIAFBAnQgADYCACABQQFqIgFBgAJHDQALQQAhAQNAQQAhAgNAIAIgAXIoAgAiAEH/AXFBAnQoAgAgAEEIdnMhACACQYAIaiICIAFyIAA2AgAgAkGAOEcNAAsgAUEEaiIBQYAIRw0ACwvOAQICfwF7IAFBf3MhAUGAgAQhAkGAgAQgAGoiAEECdUECdCIDQYSABE8EQANAIAEgAigCAHP9Ef0MA////wL///8B////AP////0OQQL9qwH9DAAAAAAABAAAAAgAAAAMAAD9UCIE/RsAKAIAIAT9GwEoAgBzIAT9GwIoAgBzIAT9GwMoAgBzIQEgAkEEaiICIANHDQALCyACIABJBEADQCABQf8BcSACLQAAc0ECdCgCACABQQh2cyEBIAJBAWoiAiAASQ0ACwsgAUF/c7gL" - -const instance = new WebAssembly.Instance( - new WebAssembly.Module(Uint8Array.from(atob(wasm), c => c.charCodeAt(0))) -) -const { t, c, m } = instance.exports as { t(): void, c(length: number, init: number): number, m: WebAssembly.Memory } -t() // initialize the table of precomputed CRCs ; this takes 8 kB in the second page of Memory +const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCpUBAkkBA38DQCABIQBBACECA0AgAEEBdiAAQQFxQaCG4u1+bHMhACACQQFqIgJBCEcNAAsgAUECdCAANgIAIAFBAWoiAUGAAkcNAAsLSQEBfyABQX9zIQFBgIAEIQJBgIAEIABqIQADQCABQf8BcSACLQAAc0ECdCgCACABQQh2cyEBIAJBAWoiAiAASQ0ACyABQX9zuAs" +const instance = new WebAssembly.Instance(new WebAssembly.Module(parseBase64(wasm))) +let { t, c, m } = instance.exports as { t(): void, c(length: number, init: number): number, m: WebAssembly.Memory } +t() // initialize the table of precomputed CRCs ; this takes up to 4 kB in the first page of Memory export const memory = m // for testing // Someday we'll have BYOB stream readers and encodeInto etc. 
@@ -14,6 +11,18 @@ export const memory = m // for testing const pageSize = 0x10000 // 64 kB const crcBuffer = makeUint8Array(m).subarray(pageSize) +/** Load the SIMD-enabled CRC32 module for improved performance. + * @param url the location of the crc32x4.wasm file ; by default, it should be next to the client-zip index */ +export async function useSimd(url: string | URL = new URL("crc32x4.wasm", import.meta.url)) { + if (WebAssembly.validate(parseBase64("AGFzbQEAAAABBQFgAAF7AwIBAAoKAQgAQQD9D/1iCw"))) { + const res = await fetch(url) + if (!res.ok) throw new Error(`HTTP error ${res.status}.`) + const source = await WebAssembly.instantiate(await res.arrayBuffer(), { m: { m } }) as WebAssembly.WebAssemblyInstantiatedSource + ({ t, c } = source.instance.exports as { t(): void, c(length: number, init: number): number }) + t() + } +} + export function crc32(data: Uint8Array, crc = 0) { for (const part of splitBuffer(data)) { crcBuffer.set(part) diff --git a/src/crc32.wat b/src/crc32.wat index 57b7ad1..d2a23fa 100644 --- a/src/crc32.wat +++ b/src/crc32.wat @@ -20,75 +20,13 @@ (local.tee $i (i32.add (local.get $i) (i32.const 1))) (br_if 0 (i32.ne (i32.const 0x100))) ) - (local.set $i (i32.const 0)) - (loop ;; now compute 3 more tables for slice-by-4 - (local.set $j (i32.const 0)) - (loop - (i32.load (i32.or (local.get $j) (local.get $i))) - local.tee $crc - - (i32.and (i32.const 0xFF)) - (i32.shl (i32.const 2)) - i32.load - - (i32.shr_u (local.get $crc) (i32.const 8)) - (local.set $crc (i32.xor)) - - (local.tee $j (i32.add (local.get $j) (i32.const 0x400))) ;; j++ - (i32.or (local.get $i)) - - local.get $crc - i32.store - (br_if 0 (i32.ne (local.get $j) (i32.const 0xC00))) - ) - (local.tee $i (i32.add (local.get $i) (i32.const 4))) - (br_if 0 (i32.ne (i32.const 0x400))) - ) ;; in total, the tables occupy the first 4 kB of the first mem page ) ;; this computes the CRC32 of what you put in the module's second page of Memory ;; (do not overwrite the first page!) 
- (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) (local $v v128) + (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local.set $crc (i32.xor (local.get $crc) (i32.const -1))) (local.set $i (i32.const 0x10000)) - (local.tee $len (i32.add (i32.const 0x10000) (local.get $len))) - i32.const 2 - i32.shr_s - i32.const 2 - i32.shl - local.tee $l8 - - (if (i32.ge_u (i32.const 0x10004)) - (loop - (i32.xor (local.get $crc) (i32.load (local.get $i))) - - i32x4.splat - v128.const i32x4 0xFFFFFF03 0xFFFFFF02 0xFFFFFF01 0xFFFFFF00 - i8x16.swizzle ;; this was called 'v8x16.swizzle' if you have an older WABT - - (i32x4.shl (i32.const 2)) - v128.const i32x4 0x00000000 0x00000400 0x00000800 0x00000C00 - v128.or - local.tee $v - (i32.load (i32x4.extract_lane 0)) - - local.get $v - (i32.load (i32x4.extract_lane 1)) - i32.xor - - local.get $v - (i32.load (i32x4.extract_lane 2)) - i32.xor - - local.get $v - (i32.load (i32x4.extract_lane 3)) - i32.xor - - local.set $crc ;; store the updated CRC - (local.tee $i (i32.add (local.get $i) (i32.const 4))) - (br_if 0 (i32.ne (local.get $l8))) - )) - - (if (i32.lt_u (local.get $i) (local.get $len)) + (local.set $len (i32.add (i32.const 0x10000) (local.get $len))) (loop (i32.and (local.get $crc) (i32.const 0xFF)) (i32.load8_u (local.get $i)) @@ -100,7 +38,7 @@ local.set $crc ;; store the updated CRC (local.tee $i (i32.add (local.get $i) (i32.const 1))) (br_if 0 (i32.lt_u (local.get $len))) - )) + ) (i32.xor (local.get $crc) (i32.const -1)) f64.convert_i32_u ;; return a positive Number ) diff --git a/src/crc32x4.wat b/src/crc32x4.wat new file mode 100644 index 0000000..81f8fa0 --- /dev/null +++ b/src/crc32x4.wat @@ -0,0 +1,93 @@ +;; inspired by https://create.stephan-brumme.com/crc32/ +(module + ;; reusing the memory from the basic (non-SIMD) crc32 instance, which should already have 1kB of precomputed CRCs + (memory (import "m" "m") 2) + ;; this function should be called once 
to initialize 3 more precomputed CRC tables for slice-by-4 + (func $genTable (local $crc i32) (local $i i32) (local $j i32) + (loop + (local.set $j (i32.const 0)) + (loop + (i32.load (i32.or (local.get $j) (local.get $i))) + local.tee $crc + + (i32.and (i32.const 0xFF)) + (i32.shl (i32.const 2)) + i32.load + + (i32.shr_u (local.get $crc) (i32.const 8)) + (local.set $crc (i32.xor)) + + (local.tee $j (i32.add (local.get $j) (i32.const 0x400))) + (i32.or (local.get $i)) + + local.get $crc + i32.store + (br_if 0 (i32.ne (local.get $j) (i32.const 0xC00))) + ) + (local.tee $i (i32.add (local.get $i) (i32.const 4))) + (br_if 0 (i32.ne (i32.const 0x400))) + ) ;; in total, the tables occupy the first 4 kB of the first mem page + ) + ;; this computes the CRC32 of what you put in the module's second page of Memory + ;; (do not overwrite the first page!) + (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) (local $v v128) + (local.set $crc (i32.xor (local.get $crc) (i32.const -1))) + (local.set $i (i32.const 0x10000)) + (local.tee $len (i32.add (i32.const 0x10000) (local.get $len))) + i32.const 2 + i32.shr_s + i32.const 2 + i32.shl + local.tee $l8 + + (if (i32.ge_u (i32.const 0x10004)) + (loop + (i32.xor (local.get $crc) (i32.load (local.get $i))) + + i32x4.splat + v128.const i32x4 0xFFFFFF03 0xFFFFFF02 0xFFFFFF01 0xFFFFFF00 + i8x16.swizzle ;; this was called 'v8x16.swizzle' if you have an older WABT + + (i32x4.shl (i32.const 2)) + v128.const i32x4 0x00000000 0x00000400 0x00000800 0x00000C00 + v128.or + local.tee $v + (i32.load (i32x4.extract_lane 0)) + + local.get $v + (i32.load (i32x4.extract_lane 1)) + i32.xor + + local.get $v + (i32.load (i32x4.extract_lane 2)) + i32.xor + + local.get $v + (i32.load (i32x4.extract_lane 3)) + i32.xor + + local.set $crc ;; store the updated CRC + (local.tee $i (i32.add (local.get $i) (i32.const 4))) + (br_if 0 (i32.ne (local.get $l8))) + )) + + (if (i32.lt_u (local.get $i) (local.get $len)) + ;; basic 
Sarwate algorithm for the last remaining bytes + (loop + (i32.and (local.get $crc) (i32.const 0xFF)) + (i32.load8_u (local.get $i)) + i32.xor + (i32.shl (i32.const 2)) + i32.load + (i32.shr_u (local.get $crc) (i32.const 8)) + i32.xor + local.set $crc + (local.tee $i (i32.add (local.get $i) (i32.const 1))) + (br_if 0 (i32.lt_u (local.get $len))) + )) + (i32.xor (local.get $crc) (i32.const -1)) + f64.convert_i32_u ;; return a positive Number + ) + (export "t" (func $genTable)) + (export "c" (func $crc32)) +) diff --git a/src/index.ts b/src/index.ts index 4044e78..be88e3c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,6 +1,7 @@ import "./polyfills.ts" import { BufferLike, StreamLike, normalizeInput, ReadableFromIter } from "./input.ts" import { loadFiles, ForAwaitable } from "./zip.ts" +export { useSimd } from "./crc32.ts" /** The file name and modification date will be read from the input; * extra arguments can be given to override the input's metadata. */ diff --git a/src/utils.ts b/src/utils.ts index 43aa7ed..1b35a44 100644 --- a/src/utils.ts +++ b/src/utils.ts @@ -2,3 +2,4 @@ export const makeBuffer = (size: number) => new DataView(new ArrayBuffer(size)) export const makeUint8Array = (thing: any) => new Uint8Array(thing.buffer || thing) export const clampInt32 = (n: bigint) => Math.min(0xffffffff, Number(n)) export const clampInt16 = (n: bigint) => Math.min(0xffff, Number(n)) +export const parseBase64 = (str: string) => Uint8Array.from(atob(str), c => c.charCodeAt(0)) diff --git a/src/worker.ts b/src/worker.ts index 0906eec..26ecd7f 100644 --- a/src/worker.ts +++ b/src/worker.ts @@ -1,2 +1,3 @@ -import { downloadZip } from "./index.ts" +import { downloadZip, useSimd } from "./index.ts" +useSimd().catch(console.error) export default downloadZip diff --git a/test/crc32.test.ts b/test/crc32.test.ts index 3b805c5..a712871 100644 --- a/test/crc32.test.ts +++ b/test/crc32.test.ts @@ -1,11 +1,18 @@ import { assertEquals } from 
"https://deno.land/std/testing/asserts.ts" -import { crc32, memory } from "../src/crc32.ts" +import { crc32, memory, useSimd } from "../src/crc32.ts" const table = await Deno.readFile("./test/table.array") Deno.test("the CRC32 module precomputes CRCs for each byte using the polynomial 0xEDB88320", () => { - const actual = new Uint8Array(memory.buffer).subarray(0, 0x1000) - const expected = table.subarray(0, 0x1000) + const actual = new Uint8Array(memory.buffer).subarray(0, 0x400) + const expected = table.subarray(0, 0x400) + assertEquals(actual, expected) +}) + +Deno.test("the CRC32x4 module precomputes 3 more rows of CRCs", async () => { + await withFakeFetch(useSimd, Deno.readFile("./crc32x4.wasm")) + const actual = new Uint8Array(memory.buffer).subarray(0x400, 0x1000) + const expected = table.subarray(0x400, 0x1000) assertEquals(actual, expected) }) @@ -23,3 +30,12 @@ Deno.test("the CRC32 for files larger than 64kB", () => { const zipSpec = Deno.readFileSync("./test/APPNOTE.TXT") assertEquals(crc32(new Uint8Array(zipSpec), 0), 0xbb3afe3f) }) + +async function withFakeFetch(f: () => Promise, data: Promise) { + const realFetch = globalThis.fetch + // @ts-ignore (mock fetch so crc32 module can get a wasm without HTTP) + globalThis.fetch = (_: string) => Promise.resolve({ arrayBuffer: () => data, ok: true }) + await f() + // we can restore fetch now + globalThis.fetch = realFetch +}