art-w / sherlodoc

Fuzzy type search for OCaml documentation
MIT License
69 stars 4 forks source link

Stackoverflow during indexing #3

Open Khady opened 1 year ago

Khady commented 1 year ago

I was running the indexer on an internal project for which I can't share the code. There is on odocl file that is not super big, only 68k, but triggers a stack overflow in a List.map call. Given the appearance of Cache.Make.memo later in the stacktrace, I suppose that it is from this snippet.

https://github.com/art-w/sherlodoc/blob/8b9e648700edb3fbdc3b74bd8610dcae988d86cc/index/load_doc.ml#L83-L96

Replacing List.map with ExtLib.List.map (which is tailrec) fixes the issue.

Khady commented 1 year ago

My quickfix for reference

diff --git a/index/dune b/index/dune
index e75cb53..795cf51 100644
--- a/index/dune
+++ b/index/dune
@@ -1,6 +1,7 @@
 (executable
  (name index)
  (libraries
+  extlib
   db
   fpath
   tyxml
diff --git a/index/index.ml b/index/index.ml
index 188a686..2113679 100644
--- a/index/index.ml
+++ b/index/index.ml
@@ -9,7 +9,7 @@ let of_filename f =
   in
   module_name, f

-let filenames () = List.map of_filename (Files.list odoc_directory)
+let filenames () = ExtLib.List.map of_filename (Files.list odoc_directory)

 let () =
   let files = filenames () in
diff --git a/index/load_doc.ml b/index/load_doc.ml
index e0c5c3f..2b687ff 100644
--- a/index/load_doc.ml
+++ b/index/load_doc.ml
@@ -82,7 +82,7 @@ let rec paths ~prefix ~sgn = function
         | [] -> [ prefix ]
         | _ ->
             rev_concat
-            @@ List.mapi
+            @@ ExtLib.List.mapi
                  (fun i arg ->
                    let prefix = Cache_name.memo (string_of_int i) :: prefix in
                    paths ~prefix ~sgn arg)
@@ -90,7 +90,7 @@ let rec paths ~prefix ~sgn = function
       end
   | Tuple args ->
       rev_concat
-      @@ List.mapi (fun i arg ->
+      @@ ExtLib.List.mapi (fun i arg ->
              let prefix = Cache_name.memo (string_of_int i ^ "*") :: prefix in
              paths ~prefix ~sgn arg)
       @@ args
@@ -106,7 +106,7 @@ let rec type_paths ~prefix ~sgn = function
         (type_paths ~prefix ~sgn b)
   | Constr (name, args) ->
       rev_concat
-      @@ List.map (fun name ->
+      @@ ExtLib.List.map (fun name ->
              let name = String.concat "." name in
              let prefix = name :: Types.string_of_sgn sgn :: prefix in
              begin
@@ -114,14 +114,14 @@ let rec type_paths ~prefix ~sgn = function
                | [] -> [ prefix ]
                | _ ->
                    rev_concat
-                   @@ List.mapi
+                   @@ ExtLib.List.mapi
                         (fun i arg ->
                           let prefix = string_of_int i :: prefix in
                           type_paths ~prefix ~sgn arg)
                         args
              end)
       @@ all_type_names name
-  | Tuple args -> rev_concat @@ List.map (type_paths ~prefix ~sgn) @@ args
+  | Tuple args -> rev_concat @@ ExtLib.List.map (type_paths ~prefix ~sgn) @@ args
   | _ -> []

 let save_item ~pkg ~path_list ~path name type_ doc =
@@ -162,10 +162,10 @@ let save_item ~pkg ~path_list ~path name type_ doc =
       (Db.list_of_string (Odoc_model.Names.ValueName.to_string name))
       ('.' :: path_list)
   in
-  let my_full_name = List.map Char.lowercase_ascii my_full_name in
+  let my_full_name = ExtLib.List.map Char.lowercase_ascii my_full_name in
   Db.store_name my_full_name str_type ;
   let type_paths = type_paths ~prefix:[] ~sgn:Pos type_ in
-  Db.store_all str_type (List.map (List.map Cache_name.memo) type_paths)
+  Db.store_all str_type (ExtLib.List.map (ExtLib.List.map Cache_name.memo) type_paths)

 let rec item ~pkg ~path_list ~path =
   let open Odoc_model.Lang in
@@ -233,7 +233,7 @@ let run ~odoc_directory (root_name, filename) =
     | _ ->
         invalid_arg (Printf.sprintf "not a valid package/version? %S" filename)
   in
-  Format.printf "%s %s => %s@." package version root_name ;
+  Printf.printf "%s %s => %s\n%!" package version root_name ;
   let filename = Filename.concat odoc_directory filename in
   let fpath = Result.get_ok @@ Fpath.of_string filename in
   let t =
diff --git a/index/pretty.ml b/index/pretty.ml
index 1435357..e20772c 100644
--- a/index/pretty.ml
+++ b/index/pretty.ml
@@ -54,11 +54,11 @@ and string_of_element = function

 and string_of_link_content lst =
   H.span
-    (List.map (fun r -> string_of_non_link r.Odoc_model.Location_.value) lst)
+    (ExtLib.List.map (fun r -> string_of_non_link r.Odoc_model.Location_.value) lst)

 and string_of_paragraph lst =
   H.span
-    (List.map (fun elt -> string_of_element elt.Odoc_model.Location_.value) lst)
+    (ExtLib.List.map (fun elt -> string_of_element elt.Odoc_model.Location_.value) lst)

 let string_of_doc = function
   | `Paragraph p -> Some (H.p [ string_of_paragraph p ])
diff --git a/query/dune b/query/dune
index 0b3ee71..f2af0f7 100644
--- a/query/dune
+++ b/query/dune
@@ -1,6 +1,6 @@
 (library
  (name query)
- (libraries lwt re db))
+ (libraries extlib lwt re db))

 (menhir
  (modules parser)
diff --git a/query/query.ml b/query/query.ml
index caefcda..5a4885f 100644
--- a/query/query.ml
+++ b/query/query.ml
@@ -62,7 +62,7 @@ let find_inter ~shards names =
       let db = shard.Storage.db in
       let r =
         sort @@ inter_list
-        @@ List.map
+        @@ ExtLib.List.map
              (fun (name, count) ->
                collapse_count ~count
                @@ collapse_trie_with_poly name
@@ -76,7 +76,7 @@ let find_inter ~shards names =

 let find_names ~shards names =
   let names =
-    List.map
+    ExtLib.List.map
       (fun n -> List.rev (Db.list_of_string (String.lowercase_ascii n)))
       names
   in
@@ -86,7 +86,7 @@ let find_names ~shards names =
       let open Lwt.Syntax in
       let+ () = Lwt.pause () in
       let candidates =
-        List.map
+        ExtLib.List.map
           (fun name ->
             let t = Tchar.find name db_names in
             collapse_triechar t)
diff --git a/query/query_ast.ml b/query/query_ast.ml
index a53ca4d..9a747f1 100644
--- a/query/query_ast.ml
+++ b/query/query_ast.ml
@@ -22,7 +22,7 @@ let rec paths_arrow ~prefix ~sgn = function
         | [] -> [ prefix ]
         | _ ->
             List.concat
-            @@ List.mapi
+            @@ ExtLib.List.mapi
                  (fun i arg ->
                    let prefix = string_of_int i :: prefix in
                    paths_arrow ~prefix ~sgn arg)
@@ -30,7 +30,7 @@ let rec paths_arrow ~prefix ~sgn = function
       end
   | Tuple args ->
       List.concat
-      @@ List.mapi
+      @@ ExtLib.List.mapi
            (fun i arg ->
              let prefix = (string_of_int i ^ "*") :: prefix in
              paths_arrow ~prefix ~sgn arg)
@@ -48,14 +48,14 @@ let rec paths ~prefix ~sgn = function
         | [] -> [ prefix ]
         | _ ->
             List.concat
-            @@ List.mapi
+            @@ ExtLib.List.mapi
                  (fun i arg ->
                    let prefix = string_of_int i :: prefix in
                    paths ~prefix ~sgn arg)
                  args
       end
   | Tuple args ->
-      List.concat @@ List.map (fun arg -> paths ~prefix ~sgn arg) args
+      List.concat @@ ExtLib.List.map (fun arg -> paths ~prefix ~sgn arg) args

 let rec show = function
   | Arrow (a, b) -> show_parens a ^ " -> " ^ show b
diff --git a/query/sort.ml b/query/sort.ml
index 2256d81..e86d2b0 100644
--- a/query/sort.ml
+++ b/query/sort.ml
@@ -70,7 +70,7 @@ let minimize = function
       let used = Array.make (List.length (List.hd arr)) false in
       let arr =
         Array.map (fun lst ->
-            let lst = (1, None) :: List.mapi (fun i x -> x, Some i) lst in
+            let lst = (1, None) :: ExtLib.List.mapi (fun i x -> x, Some i) lst in
             List.sort Stdlib.compare lst)
         @@ Array.of_list arr
       in
@@ -132,17 +132,17 @@ let score_type query_type paths =
   | _, [] | [], _ -> 0
   | _ ->
       let arr =
-        List.map
+        ExtLib.List.map
           (fun p ->
             let p = List.rev p in
-            List.map (fun q -> distance (List.rev q) p) query_type)
+            ExtLib.List.map (fun q -> distance (List.rev q) p) query_type)
           paths
       in
       minimize arr

 let list query_name query_type results =
   let results =
-    List.map
+    ExtLib.List.map
       (fun a ->
         let open Elt in
         let name_cost = score_name query_name a.name in
diff --git a/www/packages.ml b/www/packages.ml
index 8b2b97c..f2b071a 100644
--- a/www/packages.ml
+++ b/www/packages.ml
@@ -145,14 +145,14 @@ let html =
   div
     ~a:[ a_class [ "categories" ] ]
     (M.bindings packages
-    |> List.map (fun (category, packages) ->
+    |> ExtLib.List.map (fun (category, packages) ->
            div
              ~a:[ a_class [ "category" ] ]
              [ h3 [ txt (if category = "" then "Not classified" else category) ]
              ; div
                  ~a:[ a_class [ "packages" ] ]
                  (S.elements packages
-                 |> List.map (fun package ->
+                 |> ExtLib.List.map (fun package ->
                         a
                           ~a:
                             [ a_href ("https://ocaml.org/p/" ^ package.name)
diff --git a/www/ui.ml b/www/ui.ml
index 4e6cbd4..c799e55 100644
--- a/www/ui.ml
+++ b/www/ui.ml
@@ -33,7 +33,7 @@ let render ~pretty results =
             ~a:[ a_class [ "query" ] ]
             [ txt "Results for "; code [ txt pretty ] ]
         ; ul ~a:[ a_class [ "found" ] ]
-          @@ List.map (fun r -> li (render_result r)) results
+          @@ ExtLib.List.map (fun r -> li (render_result r)) results
         ]

 let ajax_reload =
diff --git a/www/www.ml b/www/www.ml
index 37972a8..8e14bc5 100644
--- a/www/www.ml
+++ b/www/www.ml
@@ -37,7 +37,7 @@ let api query =

 open Lwt.Syntax

-let get_query params = Option.value ~default:"" (Dream.query params "q")
+let get_query params = Stdlib.Option.value ~default:"" (Dream.query params "q")

 let root ~query fn _params =
   let* result = fn query in
art-w commented 1 year ago

Thanks a lot for debugging the issue and the fix! <3 I'm afraid there's a large function type in your lib that might cause slowdown/crashes on the website later, I'll try to reproduce soon :) (hopefully next week, I'm currently going through a tendinitis and must resist the urge to code)

... But I'm too happy that you tried to run it, and very curious: did you manage to start the website or are there other traps on the way? (PRs are welcome if you keep hacking! but also feature requests / complaints ;) )

Khady commented 1 year ago

I managed to start the website and the search is working nicely. There was no particular problem, the readme was enough. it is only internal code so far, not including the opam libs we depend on, 5.1G db to give you an idea of the scale. There is an instance now deployed internally, hopefully some colleagues will play with it next week.

thanks for the cool project. Will report more and try to contribute as usage increases.

art-w commented 1 year ago

Cool, good to know it worked!

art-w commented 6 months ago

Taking a look at this again, I believe the culprit was the outer map in the following line, since the list type_paths could grow exponentially on nested types: (while all the other lists should be reasonably small...)

Db.store_all str_type (List.map (List.map Cache_name.memo) type_paths)

The good news is that we don't have this exponential explosion anymore! But if I'm wrong, the bad news is that we delegate more work to odoc.search now, so it might be harder to fix depending on what's causing the issue. If and when you update sherlodoc, could you let us know if you still encounter the problem? :)

(At least I could not reproduce while indexing the documentation of opam packages that can be found on ocaml.org)

The command-line has changed a bit:

# using your odig generated documentation for the current switch:
$ odig odoc
$ sherlodoc index --db=db.ancient $(find $OPAM_SWITCH_PREFIX/var/cache/odig/odoc -name '*.odocl')

# or if you would like links to ocaml.org, you'll need to prepare an `odocls.tsv` file
# with three tab-separated columns: package_name\tversion\t/path/to/odocl
$ cat odocls.tsv
ocaml   5.1.0   /home/arthur/.opam/5.1.0/var/cache/odig/odoc/ocaml/stdlib__Buffer.odocl
ocaml   5.1.0   /home/arthur/.opam/5.1.0/var/cache/odig/odoc/ocaml/stdlib__Gc.odocl
...
cmdliner    1.2.0   ../ocaml-org-sherlodoc/2024/p/cmdliner/1.2.0/epoch-6896ddd946045dabbf3db278cdd97651/linked/p/cmdliner/1.2.0/page-doc.odocl
cmdliner    1.2.0   ../ocaml-org-sherlodoc/2024/p/cmdliner/1.2.0/epoch-6896ddd946045dabbf3db278cdd97651/linked/p/cmdliner/1.2.0/1.2.0/lib/cmdliner/Cmdliner.odocl
...
$ sherlodoc index --db=db.ancient --file-list=odocls.tsv

(Since you have private libs, it would be cool if the links could point to your local odig documentation but ocaml.org is currently hard-coded hmhmm)