From 42156bac50bdcfb6c7dda9a50f8b7ab54f7cd306 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20P=C3=A9pin?= <kerl@wkerl.me>
Date: Wed, 17 Feb 2021 00:09:19 +0100
Subject: [PATCH 1/4] Avoid two unnecessary blits

---
 src/lib/array/timsort.ml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib/array/timsort.ml b/src/lib/array/timsort.ml
index 254edb0..ba27f71 100644
--- a/src/lib/array/timsort.ml
+++ b/src/lib/array/timsort.ml
@@ -71,7 +71,7 @@ let rec merge_lo
   assert (Array.length src0 >= ofs0 + len0);
   assert (Array.length src1 >= ofs1 + len1);
   if len0 = 0 then
-    Array.blit src1 ofs1 dest ofs len1
+    ()
   else if len1 = 0 then
     Array.blit src0 ofs0 dest ofs len0
   else if cmp src0.(ofs0) src1.(ofs1) <= 0 then begin
@@ -105,7 +105,7 @@ let rec merge_hi
   if len0 = 0 then
     Array.blit src1 ofs1 dest ofs len1
   else if len1 = 0 then
-    Array.blit src0 ofs0 dest ofs len0
+    ()
   else if cmp src0.(ofs0 + len0 - 1) src1.(ofs1 + len1 - 1) <= 0 then begin
     dest.(ofs + len0 + len1 - 1) <- src1.(ofs1 + len1 - 1);
     merge_hi

From c7271d0a4bc5a3e9f772f2da5c0ed7cb9c32683f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20P=C3=A9pin?= <kerl@wkerl.me>
Date: Wed, 17 Feb 2021 00:21:04 +0100
Subject: [PATCH 2/4] in merge_* reduce the number of array accesses

---
 src/lib/array/timsort.ml | 66 ++++++++++++++++++++++------------------
 1 file changed, 36 insertions(+), 30 deletions(-)

diff --git a/src/lib/array/timsort.ml b/src/lib/array/timsort.ml
index ba27f71..a81164d 100644
--- a/src/lib/array/timsort.ml
+++ b/src/lib/array/timsort.ml
@@ -74,21 +74,24 @@ let rec merge_lo
     ()
   else if len1 = 0 then
     Array.blit src0 ofs0 dest ofs len0
-  else if cmp src0.(ofs0) src1.(ofs1) <= 0 then begin
-    dest.(ofs) <- src0.(ofs0);
-    merge_lo
-      cmp
-      dest (ofs + 1)
-      src0 (ofs0 + 1) (len0 - 1)
-      src1 ofs1 len1
-  end else begin
-    dest.(ofs) <- src1.(ofs1);
-    merge_lo
-      cmp
-      dest (ofs + 1)
-      src0 ofs0 len0
-      src1 (ofs1 + 1) (len1 - 1)
-  end
+  else
+    let x0 = src0.(ofs0) in
+    let x1 = src1.(ofs1) in
+    if cmp x0 x1 <= 0 then begin
+      dest.(ofs) <- x0;
+      merge_lo
+        cmp
+        dest (ofs + 1)
+        src0 (ofs0 + 1) (len0 - 1)
+        src1 ofs1 len1
+    end else begin
+      dest.(ofs) <- x1;
+      merge_lo
+        cmp
+        dest (ofs + 1)
+        src0 ofs0 len0
+        src1 (ofs1 + 1) (len1 - 1)
+    end
 
 
 let rec merge_hi
@@ -106,21 +109,24 @@ let rec merge_hi
     Array.blit src1 ofs1 dest ofs len1
   else if len1 = 0 then
     ()
-  else if cmp src0.(ofs0 + len0 - 1) src1.(ofs1 + len1 - 1) <= 0 then begin
-    dest.(ofs + len0 + len1 - 1) <- src1.(ofs1 + len1 - 1);
-    merge_hi
-      cmp
-      dest ofs
-      src0 ofs0 len0
-      src1 ofs1 (len1 - 1)
-  end else begin
-    dest.(ofs + len0 + len1 - 1) <- src0.(ofs0 + len0 - 1);
-    merge_hi
-      cmp
-      dest ofs
-      src0 ofs0 (len0 - 1)
-      src1 ofs1 len1
-  end
+  else
+    let x0 = src0.(ofs0 + len0 - 1) in
+    let x1 = src1.(ofs1 + len1 - 1) in
+    if cmp x0 x1 <= 0 then begin
+      dest.(ofs + len0 + len1 - 1) <- x1;
+      merge_hi
+        cmp
+        dest ofs
+        src0 ofs0 len0
+        src1 ofs1 (len1 - 1)
+    end else begin
+      dest.(ofs + len0 + len1 - 1) <- x0;
+      merge_hi
+        cmp
+        dest ofs
+        src0 ofs0 (len0 - 1)
+        src1 ofs1 len1
+    end
 
 
 let merge ~buffer cmp t (ofs0, len0) (ofs1, len1) =

From 4388514be0421bb8a4fe2a6bc3a4f24bd549d5be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20P=C3=A9pin?= <kerl@wkerl.me>
Date: Wed, 17 Feb 2021 00:55:13 +0100
Subject: [PATCH 3/4] merge_*: reuse memory access from previous calls

---
 src/lib/array/timsort.ml | 105 +++++++++++++++++++++++----------------
 1 file changed, 62 insertions(+), 43 deletions(-)

diff --git a/src/lib/array/timsort.ml b/src/lib/array/timsort.ml
index a81164d..150e357 100644
--- a/src/lib/array/timsort.ml
+++ b/src/lib/array/timsort.ml
@@ -64,69 +64,88 @@ let%test_module _ =
 let rec merge_lo
   cmp
   dest ofs
-  src0 ofs0 len0
-  src1 ofs1 len1
+  src0 ofs0 len0 x0
+  src1 ofs1 len1 x1
 =
   assert (Array.length dest >= ofs + len0 + len1);
   assert (Array.length src0 >= ofs0 + len0);
   assert (Array.length src1 >= ofs1 + len1);
-  if len0 = 0 then
-    ()
-  else if len1 = 0 then
-    Array.blit src0 ofs0 dest ofs len0
-  else
-    let x0 = src0.(ofs0) in
-    let x1 = src1.(ofs1) in
-    if cmp x0 x1 <= 0 then begin
-      dest.(ofs) <- x0;
+  assert (x0 = src0.(ofs0));
+  assert (x1 = src1.(ofs1));
+  assert (len0 > 0);
+  assert (len1 > 0);
+
+  (* This is used to optimise the case len0 = 1 below. *)
+  assert (dest == src1 && ofs + len0 = ofs1);
+
+  if cmp x0 x1 <= 0 then begin
+    dest.(ofs) <- x0;
+    if len0 = 1 then
+      (* We are done with the run stored in src0. Also note that there is no
+       * need to blit since (dest,ofs) = (src1,ofs1). *)
+      ()
+    else
       merge_lo
         cmp
         dest (ofs + 1)
-        src0 (ofs0 + 1) (len0 - 1)
-        src1 ofs1 len1
-    end else begin
-      dest.(ofs) <- x1;
+        src0 (ofs0 + 1) (len0 - 1) src0.(ofs0 + 1)
+        src1 ofs1 len1 x1
+  end else begin
+    dest.(ofs) <- x1;
+    if len1 = 1 then
+      (* We are done with the run stored in src1, blit the rest of the other run
+       * and exit. *)
+      Array.blit src0 ofs0 dest (ofs + 1) len0
+    else
       merge_lo
         cmp
         dest (ofs + 1)
-        src0 ofs0 len0
-        src1 (ofs1 + 1) (len1 - 1)
-    end
-
+        src0 ofs0 len0 x0
+        src1 (ofs1 + 1) (len1 - 1) src1.(ofs1 + 1)
+  end
 
 let rec merge_hi
   cmp
   dest ofs
-  src0 ofs0 len0
-  src1 ofs1 len1
+  src0 ofs0 len0 x0
+  src1 ofs1 len1 x1
 =
   assert (Array.length dest >= ofs + len0 + len1);
   assert (Array.length src0 >= ofs0 + len0);
   assert (Array.length src1 >= ofs1 + len1);
-  assert (len0 >= 0);
-  assert (len1 >= 0);
-  if len0 = 0 then
-    Array.blit src1 ofs1 dest ofs len1
-  else if len1 = 0 then
-    ()
-  else
-    let x0 = src0.(ofs0 + len0 - 1) in
-    let x1 = src1.(ofs1 + len1 - 1) in
-    if cmp x0 x1 <= 0 then begin
-      dest.(ofs + len0 + len1 - 1) <- x1;
+  assert (x0 = src0.(ofs0 + len0 - 1));
+  assert (x1 = src1.(ofs1 + len1 - 1));
+  assert (len0 > 0);
+  assert (len1 > 0);
+
+  (* This is used to optimise the case len1 = 1 below. *)
+  assert (dest == src0 && ofs = ofs0);
+
+  if cmp x0 x1 <= 0 then begin
+    dest.(ofs + len0 + len1 - 1) <- x1;
+    if len1 = 1 then
+      (* We are done with the run stored in src1. Also note that there is no
+       * need to blit since (dest,ofs) = (src0,ofs0). *)
+      ()
+    else
       merge_hi
         cmp
         dest ofs
-        src0 ofs0 len0
-        src1 ofs1 (len1 - 1)
-    end else begin
-      dest.(ofs + len0 + len1 - 1) <- x0;
+        src0 ofs0 len0 x0
+        src1 ofs1 (len1 - 1) src1.(ofs1 + len1 - 2)
+  end else begin
+    dest.(ofs + len0 + len1 - 1) <- x0;
+    if len0 = 1 then
+      (* We are done with the run stored in src0, blit the rest of the other run
+       * and exit. *)
+      Array.blit src1 ofs1 dest ofs len1
+    else
       merge_hi
         cmp
         dest ofs
-        src0 ofs0 (len0 - 1)
-        src1 ofs1 len1
-    end
+        src0 ofs0 (len0 - 1) src0.(ofs0 + len0 - 2)
+        src1 ofs1 len1 x1
+  end
 
 
 let merge ~buffer cmp t (ofs0, len0) (ofs1, len1) =
@@ -137,15 +156,15 @@ let merge ~buffer cmp t (ofs0, len0) (ofs1, len1) =
     merge_lo
       cmp
       t ofs0
-      buffer 0 len0
-      t ofs1 len1
+      buffer 0 len0 buffer.(0)
+      t ofs1 len1 t.(ofs1)
   end else begin
     Array.blit t ofs1 buffer 0 len1;
     merge_hi
       cmp
       t ofs0
-      t ofs0 len0
-      buffer 0 len1
+      t ofs0 len0 t.(ofs0 + len0 - 1)
+      buffer 0 len1 buffer.(len1 - 1)
   end;
   ofs0, len0 + len1
 

From 2649663b035170f3741ed13225a60eb83ea043e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20P=C3=A9pin?= <kerl@wkerl.me>
Date: Wed, 17 Feb 2021 02:17:49 +0100
Subject: [PATCH 4/4] Do less arithmetic in merge_{hi,lo}

We achieve this by using a representation of the form (beginning index,
ending index) for the slices rather than (beginning index, length).
This mostly reduces the number of arithmetic operations in merge_hi,
while merge_lo remains more or less untouched.

This has a noticeable impact on the in the benchmark!
---
 src/lib/array/timsort.ml | 127 +++++++++++++++++++--------------------
 1 file changed, 63 insertions(+), 64 deletions(-)

diff --git a/src/lib/array/timsort.ml b/src/lib/array/timsort.ml
index 150e357..4d79015 100644
--- a/src/lib/array/timsort.ml
+++ b/src/lib/array/timsort.ml
@@ -63,88 +63,86 @@ let%test_module _ =
 
 let rec merge_lo
   cmp
-  dest ofs
-  src0 ofs0 len0 x0
-  src1 ofs1 len1 x1
+  dest beg
+  src0 beg0 end0 x0
+  src1 beg1 end1 x1
 =
-  assert (Array.length dest >= ofs + len0 + len1);
-  assert (Array.length src0 >= ofs0 + len0);
-  assert (Array.length src1 >= ofs1 + len1);
-  assert (x0 = src0.(ofs0));
-  assert (x1 = src1.(ofs1));
-  assert (len0 > 0);
-  assert (len1 > 0);
+  assert (Array.length dest >= beg + (end0 - beg0 + 1) + (end1 - beg1 + 1));
+  assert (Array.length src0 > end0);
+  assert (Array.length src1 > end1);
+  assert (x0 = src0.(beg0));
+  assert (x1 = src1.(beg1));
+  assert (end0 >= beg0);
+  assert (end1 >= beg1);
 
   (* This is used to optimise the case len0 = 1 below. *)
-  assert (dest == src1 && ofs + len0 = ofs1);
+  assert (dest == src1 && beg + (end0 - beg0 + 1) = beg1);
 
   if cmp x0 x1 <= 0 then begin
-    dest.(ofs) <- x0;
-    if len0 = 1 then
-      (* We are done with the run stored in src0. Also note that there is no
-       * need to blit since (dest,ofs) = (src1,ofs1). *)
-      ()
-    else
+    dest.(beg) <- x0;
+    (* If this was the last element of run0 we stop. Besides, since we have
+     * [dest == src1], the remaining element of run1 are already where they
+     * are supposed to be and we can exit. *)
+    if beg0 < end0 then
       merge_lo
         cmp
-        dest (ofs + 1)
-        src0 (ofs0 + 1) (len0 - 1) src0.(ofs0 + 1)
-        src1 ofs1 len1 x1
+        dest (beg + 1)
+        src0 (beg0 + 1) end0 src0.(beg0 + 1)
+        src1 beg1 end1 x1
   end else begin
-    dest.(ofs) <- x1;
-    if len1 = 1 then
-      (* We are done with the run stored in src1, blit the rest of the other run
-       * and exit. *)
-      Array.blit src0 ofs0 dest (ofs + 1) len0
-    else
+    dest.(beg) <- x1;
+    (* If this was the last element of run1 we stop. There remains to move all
+     * the elements of run0 at the beginning of dest. *)
+    if beg1 < end1 then
       merge_lo
         cmp
-        dest (ofs + 1)
-        src0 ofs0 len0 x0
-        src1 (ofs1 + 1) (len1 - 1) src1.(ofs1 + 1)
+        dest (beg + 1)
+        src0 beg0 end0 x0
+        src1 (beg1 + 1) end1 src1.(beg1 + 1)
+    else
+      Array.blit src0 beg0 dest (beg + 1) (end0 - beg0 + 1)
   end
 
 let rec merge_hi
   cmp
-  dest ofs
-  src0 ofs0 len0 x0
-  src1 ofs1 len1 x1
+  dest end_
+  src0 beg0 end0 x0 (* run0 *)
+  src1 beg1 end1 x1 (* run1 *)
 =
-  assert (Array.length dest >= ofs + len0 + len1);
-  assert (Array.length src0 >= ofs0 + len0);
-  assert (Array.length src1 >= ofs1 + len1);
-  assert (x0 = src0.(ofs0 + len0 - 1));
-  assert (x1 = src1.(ofs1 + len1 - 1));
-  assert (len0 > 0);
-  assert (len1 > 0);
-
+  assert (Array.length dest > end_);
+  assert (Array.length src0 > end0);
+  assert (Array.length src1 > end1);
+  assert (x0 = src0.(end0));
+  assert (x1 = src1.(end1));
+  assert (end0 >= beg0);
+  assert (end1 >= beg1);
   (* This is used to optimise the case len1 = 1 below. *)
-  assert (dest == src0 && ofs = ofs0);
+  assert (dest == src0 && end_ - (end1 - beg1 + 1) = end0);
 
   if cmp x0 x1 <= 0 then begin
-    dest.(ofs + len0 + len1 - 1) <- x1;
-    if len1 = 1 then
-      (* We are done with the run stored in src1. Also note that there is no
-       * need to blit since (dest,ofs) = (src0,ofs0). *)
-      ()
-    else
+    dest.(end_) <- x1;
+    (* If this was the last element of run1 we stop. Besides, since we have
+     * [dest == src0], the remaining element of run0 are already where they
+     * are supposed to be and we can exit. *)
+    if beg1 < end1 then
       merge_hi
         cmp
-        dest ofs
-        src0 ofs0 len0 x0
-        src1 ofs1 (len1 - 1) src1.(ofs1 + len1 - 2)
+        dest (end_ - 1)
+        src0 beg0 end0 x0
+        src1 beg1 (end1 - 1) src1.(end1 - 1)
   end else begin
-    dest.(ofs + len0 + len1 - 1) <- x0;
-    if len0 = 1 then
-      (* We are done with the run stored in src0, blit the rest of the other run
-       * and exit. *)
-      Array.blit src1 ofs1 dest ofs len1
-    else
+    dest.(end_) <- x0;
+    (* If this was the last element of run0 we stop. There remains to move all
+     * the elements of run1 at the end of dest. *)
+    if beg0 < end0 then
       merge_hi
         cmp
-        dest ofs
-        src0 ofs0 (len0 - 1) src0.(ofs0 + len0 - 2)
-        src1 ofs1 len1 x1
+        dest (end_ - 1)
+        src0 beg0 (end0 - 1) src0.(end0 - 1)
+        src1 beg1 end1 x1
+    else
+      let len1 = end1 - beg1 + 1 in
+      Array.blit src1 beg1 dest (end_ - len1) len1
   end
 
 
@@ -156,15 +154,16 @@ let merge ~buffer cmp t (ofs0, len0) (ofs1, len1) =
     merge_lo
       cmp
       t ofs0
-      buffer 0 len0 buffer.(0)
-      t ofs1 len1 t.(ofs1)
+      buffer 0 (len0 - 1) buffer.(0)
+      t ofs1 (ofs1 + len1 - 1) t.(ofs1)
   end else begin
     Array.blit t ofs1 buffer 0 len1;
+    let top0 = ofs0 + len0 - 1 in
     merge_hi
       cmp
-      t ofs0
-      t ofs0 len0 t.(ofs0 + len0 - 1)
-      buffer 0 len1 buffer.(len1 - 1)
+      t (top0 + len1)
+      t ofs0 top0 t.(top0)
+      buffer 0 (len1 - 1) buffer.(len1 - 1)
   end;
   ofs0, len0 + len1