diff --git a/README.md b/README.md
index 0323232..14e170c 100644
--- a/README.md
+++ b/README.md
@@ -61,7 +61,7 @@ When necessary, client-zip will generate Zip64 archives. Those are not readable
 
 # Usage
 
-The module exports (and the worker script globally defines) a single function:
+The module exports (and the worker script globally defines) this function:
 ```typescript
 function downloadZip(files: ForAwaitable): Response
 ```
@@ -78,6 +78,21 @@ The function returns a `Response` immediately. You don't need to wait for the wh
 
 Unless your list of inputs is quite small, you should prefer generators (when zipping Files or other resources that are already available) and async generators (when zipping Responses so you can `fetch` them lazily, or other resources that are generated last-minute so you don't need to store them longer than necessary) to provide the inputs to `downloadZip`.
 
+### Setting up vector processing for CRC32
+
+There is a second export from the module:
+```typescript
+function useSimd(url?: string | URL): Promise<void>
+```
+
+**Warning**: it may reject! You should catch errors and log them in dev mode until you fix the URL or hosting configuration.
+
+When you call that function, it will feature-test for SIMD instructions in WebAssembly and, if that succeeds, attempt to replace the CRC32 module with one that's almost 30% faster (in Deno, at least) for all current and future uses of `downloadZip`.
+
+You may call `useSimd` at any point (but preferably early and only once). By default it will look for a file named "crc32x4.wasm" in the same location as `import.meta.url`, which works when loading the library from a CDN but not necessarily if you bundle the library code into your app or "vendor" script. That's why you can pass a URL argument to `useSimd` (it will fetch the WASM from there instead ; depending on your setup you might want to copy the file to some static storage, or point to a CDN).
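+
+For example (a minimal sketch ; the import path and file locations below are placeholders, adjust them to your setup):
+```typescript
+import { downloadZip, useSimd } from "client-zip"
+
+// opt in once, early ; if it rejects, you simply keep the portable CRC32
+useSimd().catch(console.error)
+// or, if you bundle client-zip, point it at wherever you serve the file:
+// useSimd("/assets/crc32x4.wasm").catch(console.error)
+
+async function zipTwoFiles() {
+  const intro = await fetch("/assets/intro.txt")
+  const photo = await fetch("/assets/photo.png")
+  return downloadZip([intro, photo]).blob()
+}
+```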
+
+The IIFE worker script does *not* expose `useSimd` ; instead, it calls it immediately with no argument. So if you host "worker.js", make sure "crc32x4.wasm" is served right next to it.
+
 # Benchmarks
 
 I started this project because I wasn't impressed with what appeared to be the only other ZIP library for browsers, [JSZip](https://stuk.github.io/jszip/). The JSZip website acknowledges its performance limitations, but now we can actually quantify them. I later found other libraries, which I've included in the new benchmarks.
@@ -96,20 +111,22 @@ The experiments were run about 10 times for each lib and each dataset with a few
 
 *For the baseline, I timed the `zip` process in my UNIX shell — clearly there is much room for improvement.
 
-The files were served over HTTP/1.1 by nginx running on localhost, with cache enabled (not that it makes a difference). The overhead of HTTP (not network, just having to go through the layers) really shows in the dataset with 12k files.
+The files were served over HTTP by nginx running on localhost, with cache enabled (not that it makes a difference).
+
+It's interesting that Chrome performs so much worse than Safari with client-zip and conflux, the two libraries that rely on WHATWG Streams and (in my case) async iterables, whereas it shows better runtimes with fflate (slightly) and JSZip (by a lot, though it may be a fluke as I did not repeat the experiment), both of which use synchronous code with callbacks. Shame on you, Chrome.
 
-It's interesting that Chrome performs so much worse than Safari with client-zip and conflux, the two libraries that rely on WHATWG Streams and (in my case) async iterables, whereas it shows better runtimes with fflate (slightly) and JSZip (by a lot, though it may be a fluke as I did not repeat the 2-minutes long experiment), both of which use synchronous code with callbacks.
+Also of note, using the SIMD-enabled CRC32 implementation in Chrome did not improve the overall performance of client-zip, suggesting that Chrome creates a bottleneck somewhere else.
 
-Finally, I tried to run the experiment with 12k small files in Chrome, but it didn't finish after a few minutes so I gave up. Perhaps something to do with an inefficient handling of HTTP requests (I did disable network logging and enable network cache, but saw no impovement).
+Finally, I tried to run the experiment with 12k small files in Chrome, but it was extremely slow. Perhaps it has something to do with inefficient handling of so many HTTP requests (I did disable network logging and enable the cache, but saw no improvement).
 
 Memory usage for any amount of data (when streaming using a ServiceWorker, or, in my test case for Zip64, deno) will remain constant or close enough. My tests maxed out at 36.1 MB of RAM while processing nearly 6 GB.
 
 Now, comparing bundle size is clearly unfair because the others do a bunch of things that my library doesn't. Here you go anyway (sizes are shown in decimal kilobytes):
 
-|                    | `client-zip`@2.0.0 | fflate@0.7.1 | conflux@3 | JSZip@3.6 |
+|                    | `client-zip`@2.1.0 | fflate@0.7.1 | conflux@3 | JSZip@3.6 |
 |--------------------|-------------------:|-------------:|----------:|----------:|
-| minified           |             4.6 kB |        29 kB |    185 kB |     96 kB |
-| minified + gzipped |             2.1 kB |        11 kB |     53 kB |     27 kB |
+| minified           |             5.0 kB |        29 kB |    185 kB |     96 kB |
+| minified + gzipped |             2.5 kB |        11 kB |     53 kB |     27 kB |
 
 The datasets I used in the new tests are not public domain, but nothing sensitive either ; I can send them if you ask.
 
diff --git a/crc32x4.wasm b/crc32x4.wasm
new file mode 100644
index 0000000..47b307b
Binary files /dev/null and b/crc32x4.wasm differ
diff --git a/index.d.ts b/index.d.ts
index 25a89ff..ed10c67 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -11,3 +11,7 @@ type InputWithoutMeta = { input: BufferLike | StreamLike, name: any, lastModifie
 type ForAwaitable<T> = AsyncIterable<T> | Iterable<T>
 
 export declare function downloadZip(files: ForAwaitable): Response
+
+/** Load the SIMD-enabled CRC32 module for improved performance.
+ * @param url the location of the crc32x4.wasm file ; by default, it should be next to the client-zip index */
+export declare function useSimd(url?: string | URL): Promise<void>
diff --git a/src/crc32.ts b/src/crc32.ts
index 7bdb6bb..cf3f600 100644
--- a/src/crc32.ts
+++ b/src/crc32.ts
@@ -1,12 +1,9 @@
-import { makeUint8Array } from "./utils.ts"
+import { makeUint8Array, parseBase64 } from "./utils.ts"
 
 const wasm = "AGFzbQEAAAABCgJgAABgAn9/AXwDAwIAAQUDAQACBw0DAW0CAAF0AAABYwABCpUBAkkBA38DQCABIQBBACECA0AgAEEBdiAAQQFxQaCG4u1+bHMhACACQQFqIgJBCEcNAAsgAUECdCAANgIAIAFBAWoiAUGAAkcNAAsLSQEBfyABQX9zIQFBgIAEIQJBgIAEIABqIQADQCABQf8BcSACLQAAc0ECdCgCACABQQh2cyEBIAJBAWoiAiAASQ0ACyABQX9zuAs"
-
-const instance = new WebAssembly.Instance(
-  new WebAssembly.Module(Uint8Array.from(atob(wasm), c => c.charCodeAt(0)))
-)
-const { t, c, m } = instance.exports as { t(): void, c(length: number, init: number): number, m: WebAssembly.Memory }
-t() // initialize the table of precomputed CRCs ; this takes 8 kB in the second page of Memory
+const instance = new WebAssembly.Instance(new WebAssembly.Module(parseBase64(wasm)))
+let { t, c, m } = instance.exports as { t(): void, c(length: number, init: number): number, m: WebAssembly.Memory }
+t() // initialize the table of precomputed CRCs ; this takes up to 4 kB in the first page of Memory
 export const memory = m // for testing
 
 // Someday we'll have BYOB stream readers and encodeInto etc.
@@ -14,6 +11,18 @@ export const memory = m // for testing
 const pageSize = 0x10000 // 64 kB
 const crcBuffer = makeUint8Array(m).subarray(pageSize)
 
+/** Load the SIMD-enabled CRC32 module for improved performance.
+ * @param url the location of the crc32x4.wasm file ; by default, it should be next to the client-zip index */
+export async function useSimd(url: string | URL = new URL("crc32x4.wasm", import.meta.url)) {
+  if (WebAssembly.validate(parseBase64("AGFzbQEAAAABBQFgAAF7AwIBAAoKAQgAQQD9D/1iCw"))) {
+    const res = await fetch(url)
+    if (!res.ok) throw new Error(`HTTP error ${res.status}.`)
+    const source = await WebAssembly.instantiate(await res.arrayBuffer(), { m: { m } }) as WebAssembly.WebAssemblyInstantiatedSource
+    ({ t, c } = source.instance.exports as { t(): void, c(length: number, init: number): number })
+    t()
+  }
+}
+
 export function crc32(data: Uint8Array, crc = 0) {
   for (const part of splitBuffer(data)) {
     crcBuffer.set(part)
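As an aside, the chunked design of `crc32` above can be illustrated with a small Deno sketch (not part of the patch ; the import path is assumed relative to the repository root): passing the previous result as the second argument continues the checksum, which is exactly how inputs larger than one 64 kB page are folded in.

```typescript
import { crc32 } from "./src/crc32.ts"

const bytes = (s: string) => new TextEncoder().encode(s)

// crc32(b, crc32(a)) gives the same result as crc32 of the concatenated input,
// because the wrapper re-enters the WASM `c` with the previous CRC as `init`
const whole = crc32(bytes("hello world"))
const chunked = crc32(bytes("world"), crc32(bytes("hello ")))
console.assert(whole === chunked)
```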
diff --git a/src/crc32.wat b/src/crc32.wat
index 78e7593..d2a23fa 100644
--- a/src/crc32.wat
+++ b/src/crc32.wat
@@ -44,4 +44,4 @@
   )
   (export "t" (func $genTable))
   (export "c" (func $crc32))
-)
\ No newline at end of file
+)
diff --git a/src/crc32x4.wat b/src/crc32x4.wat
new file mode 100644
index 0000000..81f8fa0
--- /dev/null
+++ b/src/crc32x4.wat
@@ -0,0 +1,93 @@
+;; inspired by https://create.stephan-brumme.com/crc32/
+(module
+  ;; reusing the memory from the basic (non-SIMD) crc32 instance, which should already have 1kB of precomputed CRCs
+  (memory (import "m" "m") 2)
+  ;; this function should be called once to initialize 3 more precomputed CRC tables for slice-by-4
+  (func $genTable (local $crc i32) (local $i i32) (local $j i32)
+    (loop
+      (local.set $j (i32.const 0))
+      (loop
+        (i32.load (i32.or (local.get $j) (local.get $i)))
+        local.tee $crc
+
+        (i32.and (i32.const 0xFF))
+        (i32.shl (i32.const 2))
+        i32.load
+
+        (i32.shr_u (local.get $crc) (i32.const 8))
+        (local.set $crc (i32.xor))
+
+        (local.tee $j (i32.add (local.get $j) (i32.const 0x400)))
+        (i32.or (local.get $i))
+
+        local.get $crc
+        i32.store
+        (br_if 0 (i32.ne (local.get $j) (i32.const 0xC00)))
+      )
+      (local.tee $i (i32.add (local.get $i) (i32.const 4)))
+      (br_if 0 (i32.ne (i32.const 0x400)))
+    ) ;; in total, the tables occupy the first 4 kB of the first mem page
+  )
+  ;; this computes the CRC32 of what you put in the module's second page of Memory
+  ;; (do not overwrite the first page!)
+  (func $crc32 (param $len i32) (param $crc i32) (result f64) (local $i i32) (local $l8 i32) (local $v v128)
+    (local.set $crc (i32.xor (local.get $crc) (i32.const -1)))
+    (local.set $i (i32.const 0x10000))
+    (local.tee $len (i32.add (i32.const 0x10000) (local.get $len)))
+    i32.const 2
+    i32.shr_s
+    i32.const 2
+    i32.shl
+    local.tee $l8
+
+    (if (i32.ge_u (i32.const 0x10004))
+      (loop
+        (i32.xor (local.get $crc) (i32.load (local.get $i)))
+
+        i32x4.splat
+        v128.const i32x4 0xFFFFFF03 0xFFFFFF02 0xFFFFFF01 0xFFFFFF00
+        i8x16.swizzle ;; this was called 'v8x16.swizzle' if you have an older WABT
+
+        (i32x4.shl (i32.const 2))
+        v128.const i32x4 0x00000000 0x00000400 0x00000800 0x00000C00
+        v128.or
+        local.tee $v
+        (i32.load (i32x4.extract_lane 0))
+
+        local.get $v
+        (i32.load (i32x4.extract_lane 1))
+        i32.xor
+
+        local.get $v
+        (i32.load (i32x4.extract_lane 2))
+        i32.xor
+
+        local.get $v
+        (i32.load (i32x4.extract_lane 3))
+        i32.xor
+
+        local.set $crc ;; store the updated CRC
+        (local.tee $i (i32.add (local.get $i) (i32.const 4)))
+        (br_if 0 (i32.ne (local.get $l8)))
+      ))
+
+    (if (i32.lt_u (local.get $i) (local.get $len))
+      ;; basic Sarwate algorithm for the last remaining bytes
+      (loop
+        (i32.and (local.get $crc) (i32.const 0xFF))
+        (i32.load8_u (local.get $i))
+        i32.xor
+        (i32.shl (i32.const 2))
+        i32.load
+        (i32.shr_u (local.get $crc) (i32.const 8))
+        i32.xor
+        local.set $crc
+        (local.tee $i (i32.add (local.get $i) (i32.const 1)))
+        (br_if 0 (i32.lt_u (local.get $len)))
+      ))
+    (i32.xor (local.get $crc) (i32.const -1))
+    f64.convert_i32_u ;; return a positive Number
+  )
+  (export "t" (func $genTable))
+  (export "c" (func $crc32))
+)
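For readers who do not read WAT fluently, here is a scalar TypeScript sketch of the slicing-by-4 lookup implemented above (illustrative only, not part of the patch ; `tables` stands for the 4 × 256 precomputed entries that the table generators lay out in the first 4 kB of Memory):

```typescript
function crc32SliceBy4(tables: Uint32Array, data: Uint8Array, crc = 0): number {
  crc = ~crc
  let i = 0
  // fold 4 input bytes into the CRC at once, one table row per byte
  for (; i + 4 <= data.length; i += 4) {
    const x = crc ^ (data[i] | (data[i + 1] << 8) | (data[i + 2] << 16) | (data[i + 3] << 24))
    crc = tables[0x300 + (x & 0xFF)]
        ^ tables[0x200 + ((x >>> 8) & 0xFF)]
        ^ tables[0x100 + ((x >>> 16) & 0xFF)]
        ^ tables[x >>> 24]
  }
  // plain Sarwate algorithm for the 0 to 3 trailing bytes
  for (; i < data.length; i++)
    crc = tables[(crc ^ data[i]) & 0xFF] ^ (crc >>> 8)
  return ~crc >>> 0
}
```

With `new Uint32Array(memory.buffer, 0, 0x400)` captured after both `t` exports have run, this should agree with what the module computes (the WASM version merely returns the result as an f64 so JavaScript sees a positive Number).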
diff --git a/src/index.ts b/src/index.ts
index 4044e78..be88e3c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,6 +1,7 @@
 import "./polyfills.ts"
 import { BufferLike, StreamLike, normalizeInput, ReadableFromIter } from "./input.ts"
 import { loadFiles, ForAwaitable } from "./zip.ts"
+export { useSimd } from "./crc32.ts"
 
 /** The file name and modification date will be read from the input;
  * extra arguments can be given to override the input's metadata. */
diff --git a/src/utils.ts b/src/utils.ts
index 43aa7ed..1b35a44 100644
--- a/src/utils.ts
+++ b/src/utils.ts
@@ -2,3 +2,4 @@ export const makeBuffer = (size: number) => new DataView(new ArrayBuffer(size))
 export const makeUint8Array = (thing: any) => new Uint8Array(thing.buffer || thing)
 export const clampInt32 = (n: bigint) => Math.min(0xffffffff, Number(n))
 export const clampInt16 = (n: bigint) => Math.min(0xffff, Number(n))
+export const parseBase64 = (str: string) => Uint8Array.from(atob(str), c => c.charCodeAt(0))
diff --git a/src/worker.ts b/src/worker.ts
index 0906eec..26ecd7f 100644
--- a/src/worker.ts
+++ b/src/worker.ts
@@ -1,2 +1,3 @@
-import { downloadZip } from "./index.ts"
+import { downloadZip, useSimd } from "./index.ts"
+useSimd().catch(console.error)
 export default downloadZip
diff --git a/test/crc32.test.ts b/test/crc32.test.ts
index d28c4e1..a712871 100644
--- a/test/crc32.test.ts
+++ b/test/crc32.test.ts
@@ -1,11 +1,18 @@
 import { assertEquals } from "https://deno.land/std/testing/asserts.ts"
-import { crc32, memory } from "../src/crc32.ts"
+import { crc32, memory, useSimd } from "../src/crc32.ts"
 
 const table = await Deno.readFile("./test/table.array")
 
 Deno.test("the CRC32 module precomputes CRCs for each byte using the polynomial 0xEDB88320", () => {
-  const actual = new Uint8Array(memory.buffer.slice(0, 0x0400))
-  const expected = table.slice(0, 0x400)
+  const actual = new Uint8Array(memory.buffer).subarray(0, 0x400)
+  const expected = table.subarray(0, 0x400)
+  assertEquals(actual, expected)
+})
+
+Deno.test("the CRC32x4 module precomputes 3 more rows of CRCs", async () => {
+  await withFakeFetch(useSimd, Deno.readFile("./crc32x4.wasm"))
+  const actual = new Uint8Array(memory.buffer).subarray(0x400, 0x1000)
+  const expected = table.subarray(0x400, 0x1000)
   assertEquals(actual, expected)
 })
 
@@ -23,3 +30,12 @@ Deno.test("the CRC32 for files larger than 64kB", () => {
   const zipSpec = Deno.readFileSync("./test/APPNOTE.TXT")
   assertEquals(crc32(new Uint8Array(zipSpec), 0), 0xbb3afe3f)
 })
+
+async function withFakeFetch(f: () => Promise<void>, data: Promise<Uint8Array>) {
+  const realFetch = globalThis.fetch
+  // @ts-ignore (mock fetch so crc32 module can get a wasm without HTTP)
+  globalThis.fetch = (_: string) => Promise.resolve({ arrayBuffer: () => data, ok: true })
+  await f()
+  // we can restore fetch now
+  globalThis.fetch = realFetch
+}