diff --git a/.JuliaFormatter.toml b/.JuliaFormatter.toml
index 16f5cb2b..0cd5f15f 100644
--- a/.JuliaFormatter.toml
+++ b/.JuliaFormatter.toml
@@ -1,3 +1,5 @@
 style = "sciml"
 annotate_untyped_fields_with_any = false
 format_markdown = true
+format_docstrings = true
+join_lines_based_on_source = false
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index e6d309af..7e8d929a 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -13,17 +13,16 @@ concurrency:
   cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
 jobs:
   test:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
-        group:
-          - Shooting
-          - MIRK
-          - Others
+        os:
+          - ubuntu-latest
+          - macos-latest
+          - windows-latest
         version:
           - '1'
-          - '~1.10.0-0'
     steps:
       - uses: actions/checkout@v4
       - uses: julia-actions/setup-julia@v1
@@ -42,7 +41,9 @@ jobs:
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
         env:
-          GROUP: ${{ matrix.group }}
+          GROUP: "CPU"
+          RETESTITEMS_NWORKERS: 4
+          RETESTITEMS_NWORKER_THREADS: 2
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v4
         with:
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index 46670d75..0fe6c374 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        julia-version: [1.5.0]
+        julia-version: [1]
         julia-arch: [x86]
         os: [ubuntu-latest]
     steps:
diff --git a/Project.toml b/Project.toml
index a1f4c4f5..85c9cf6d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,6 +1,6 @@
 name = "BoundaryValueDiffEq"
 uuid = "764a87c0-6b3e-53db-9096-fe964310641d"
-version = "5.6.3"
+version = "5.7.0"
 
 [deps]
 ADTypes = "47edcb42-4c32-4615-8424-f2b9edc5f35b"
@@ -10,10 +10,13 @@ BandedMatrices = "aae01518-5342-5314-be14-df237901396f"
 ConcreteStructs = "2569d6c7-a4a2-43d3-a901-331e8e4be471"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 FastAlmostBandedMatrices = "9d29842c-ecb8-4973-b1e9-a27b1157504e"
+FastClosures = "9aa1b823-49e4-5ca5-8b0f-3971ec8bab6a"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
+Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
 NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
+OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 PreallocationTools = "d236fae5-4411-538c-8e31-a6e3d9e00b46"
 PrecompileTools = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
@@ -23,57 +26,59 @@ SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
-Tricks = "410a4b4d-49e4-4fbc-ab6d-cb71b17b3775"
-TruncatedStacktraces = "781d530d-4396-4725-bb49-402e4bee1e77"
-UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 
 [weakdeps]
 ODEInterface = "54ca160b-1b9f-5127-a996-1867f4bc2a2c"
-OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 
 [extensions]
 BoundaryValueDiffEqODEInterfaceExt = "ODEInterface"
-BoundaryValueDiffEqOrdinaryDiffEqExt = "OrdinaryDiffEq"
 
 [compat]
-ADTypes = "0.2"
-Adapt = "3, 4"
-Aqua = "0.7"
-ArrayInterface = "7"
-BandedMatrices = "1"
-ConcreteStructs = "0.2"
-DiffEqBase = "6.145"
-FastAlmostBandedMatrices = "0.1"
-ForwardDiff = "0.10"
-LinearAlgebra = "1.9"
-LinearSolve = "2.20"
-NonlinearSolve = "2.6.1, 3"
+ADTypes = "0.2.6"
+Adapt = "4"
+Aqua = "0.8"
+ArrayInterface = "7.7"
+BandedMatrices = "1.4"
+ConcreteStructs = "0.2.3"
+DiffEqBase = "6.146"
+DiffEqDevTools = "2.44"
+FastAlmostBandedMatrices = "0.1.1"
+FastClosures = "0.3"
+ForwardDiff = "0.10.36"
+JET = "0.8"
+LinearAlgebra = "1.10"
+LinearSolve = "2.21"
+Logging = "1.10"
+NonlinearSolve = "3.8.1"
 ODEInterface = "0.5"
-OrdinaryDiffEq = "6"
+OrdinaryDiffEq = "6.63"
 PreallocationTools = "0.4"
-PrecompileTools = "1"
-Preferences = "1"
-RecursiveArrayTools = "2.38.10, 3"
-Reexport = "0.2, 1.0"
-SciMLBase = "2.6.2"
+PrecompileTools = "1.2"
+Preferences = "1.4"
+Random = "1.10"
+ReTestItems = "1.23.1"
+RecursiveArrayTools = "3.4"
+Reexport = "1.2"
+SciMLBase = "2.31"
 Setfield = "1"
-SparseArrays = "1.9"
-SparseDiffTools = "2.9"
-Tricks = "0.1"
-TruncatedStacktraces = "1"
-UnPack = "1"
-julia = "1.9"
+SparseArrays = "1.10"
+SparseDiffTools = "2.14"
+StaticArrays = "1.8.1"
+Test = "1.10"
+julia = "1.10"
 
 [extras]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
 DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d"
+JET = "c3a54625-cd67-489e-a8e7-0a5a0ff4e31b"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 ODEInterface = "54ca160b-1b9f-5127-a996-1867f4bc2a2c"
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+ReTestItems = "817f1d60-ba6b-4fd5-9520-3cf149f6a823"
+RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["StaticArrays", "Random", "DiffEqDevTools", "OrdinaryDiffEq", "Test", "SafeTestsets", "ODEInterface", "Aqua", "LinearSolve"]
+test = ["Aqua", "DiffEqDevTools", "JET", "LinearSolve", "ODEInterface", "OrdinaryDiffEq", "Random", "ReTestItems", "RecursiveArrayTools", "StaticArrays", "Test"]
diff --git a/README.md b/README.md
index 873ac7d9..523d1883 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ functionality should check out [DifferentialEquations.jl](https://github.com/Jul
 
 ## API
 
-BoundaryValueDiffEq.jl is part of the JuliaDiffEq common interface, but can be used independently of DifferentialEquations.jl. The only requirement is that the user passes a BoundaryValueDiffEq.jl algorithm to solve. For example, we can solve the [BVP tutorial from the documentation](https://docs.sciml.ai/DiffEqDocs/stable/tutorials/bvp_example/) using the `MIRK4()` algorithm:
+BoundaryValueDiffEq.jl is part of the SciML common interface, but can be used independently of DifferentialEquations.jl. The only requirement is that the user passes a BoundaryValueDiffEq.jl algorithm to solve. For example, we can solve the [BVP tutorial from the documentation](https://docs.sciml.ai/DiffEqDocs/stable/tutorials/bvp_example/) using the `MIRK4()` algorithm:
 
 ```julia
 using BoundaryValueDiffEq
@@ -46,18 +46,18 @@ For the list of available solvers, please refer to the [DifferentialEquations.jl
 Precompilation can be controlled via `Preferences.jl`
 
   - `PrecompileMIRK` -- Precompile the MIRK2 - MIRK6 algorithms (default: `true`).
-  - `PrecompileShooting` -- Precompile the single shooting algorithms (default: `false`). This is triggered when `OrdinaryDiffEq` is loaded.
-  - `PrecompileMultipleShooting` -- Precompile the multiple shooting algorithms (default: `false`). This is triggered when `OrdinaryDiffEq` is loaded.
-  - `PrecompileMIRKNLLS` -- Precompile the MIRK2 - MIRK6 algorithms for under-determined and over-determined BVPs (default: `false`).
-  - `PrecompileShootingNLLS` -- Precompile the single shooting algorithms for under-determined and over-determined BVPs (default: `false`). This is triggered when `OrdinaryDiffEq` is loaded.
-  - `PrecompileMultipleShootingNLLS` -- Precompile the multiple shooting algorithms for under-determined and over-determined BVPs (default: `false` ). This is triggered when `OrdinaryDiffEq` is loaded.
+  - `PrecompileShooting` -- Precompile the single shooting algorithms (default: `true`).
+  - `PrecompileMultipleShooting` -- Precompile the multiple shooting algorithms (default: `true`).
+  - `PrecompileMIRKNLLS` -- Precompile the MIRK2 - MIRK6 algorithms for under-determined and over-determined BVPs (default: `true`).
+  - `PrecompileShootingNLLS` -- Precompile the single shooting algorithms for under-determined and over-determined BVPs (default: `true`).
+  - `PrecompileMultipleShootingNLLS` -- Precompile the multiple shooting algorithms for under-determined and over-determined BVPs (default: `true` ).
 
 To set these preferences before loading the package, do the following (replacing `PrecompileShooting` with the preference you want to set, or pass in multiple pairs to set them together):
 
 ```julia
 using Preferences, UUIDs
-Preferences.set_preferences!(UUID("764a87c0-6b3e-53db-9096-fe964310641d"),
-    "PrecompileShooting" => false)
+Preferences.set_preferences!(
+    UUID("764a87c0-6b3e-53db-9096-fe964310641d"), "PrecompileShooting" => false)
 ```
 
 ## Running Benchmarks Locally
diff --git a/benchmark/simple_pendulum.jl b/benchmark/simple_pendulum.jl
index 8e9df3d0..ce09f1f0 100644
--- a/benchmark/simple_pendulum.jl
+++ b/benchmark/simple_pendulum.jl
@@ -44,42 +44,36 @@ function create_simple_pendulum_benchmark()
 
     if @isdefined(MultipleShooting)
         iip_suite["MultipleShooting(100, Tsit5; grid_coarsening = true)"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_iip,
-            $MultipleShooting(100, Tsit5()))
+            $SimplePendulumBenchmark.prob_iip, $MultipleShooting(100, Tsit5()))
         iip_suite["MultipleShooting(100, Tsit5; grid_coarsening = false)"] = @benchmarkable solve(
             $SimplePendulumBenchmark.prob_iip,
             $MultipleShooting(100, Tsit5(); grid_coarsening = false))
         iip_suite["MultipleShooting(10, Tsit5; grid_coarsening = true)"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_iip,
-            $MultipleShooting(10, Tsit5()))
+            $SimplePendulumBenchmark.prob_iip, $MultipleShooting(10, Tsit5()))
         iip_suite["MultipleShooting(10, Tsit5; grid_coarsening = false)"] = @benchmarkable solve(
             $SimplePendulumBenchmark.prob_iip,
             $MultipleShooting(10, Tsit5(); grid_coarsening = false))
     end
     if @isdefined(Shooting)
         iip_suite["Shooting(Tsit5())"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_iip,
-            $Shooting(Tsit5()))
+            $SimplePendulumBenchmark.prob_iip, $Shooting(Tsit5()))
     end
 
     if @isdefined(MultipleShooting)
         oop_suite["MultipleShooting(100, Tsit5; grid_coarsening = true)"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_oop,
-            $MultipleShooting(100, Tsit5()))
+            $SimplePendulumBenchmark.prob_oop, $MultipleShooting(100, Tsit5()))
         oop_suite["MultipleShooting(100, Tsit5; grid_coarsening = false)"] = @benchmarkable solve(
             $SimplePendulumBenchmark.prob_oop,
             $MultipleShooting(100, Tsit5(); grid_coarsening = false))
         oop_suite["MultipleShooting(10, Tsit5; grid_coarsening = true)"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_oop,
-            $MultipleShooting(10, Tsit5()))
+            $SimplePendulumBenchmark.prob_oop, $MultipleShooting(10, Tsit5()))
         oop_suite["MultipleShooting(10, Tsit5; grid_coarsening = false)"] = @benchmarkable solve(
             $SimplePendulumBenchmark.prob_oop,
             $MultipleShooting(10, Tsit5(); grid_coarsening = false))
     end
     if @isdefined(Shooting)
         oop_suite["Shooting(Tsit5())"] = @benchmarkable solve(
-            $SimplePendulumBenchmark.prob_oop,
-            $Shooting(Tsit5()))
+            $SimplePendulumBenchmark.prob_oop, $Shooting(Tsit5()))
     end
 
     return suite
diff --git a/ext/BoundaryValueDiffEqODEInterfaceExt.jl b/ext/BoundaryValueDiffEqODEInterfaceExt.jl
index 298b34e8..9e7a76ab 100644
--- a/ext/BoundaryValueDiffEqODEInterfaceExt.jl
+++ b/ext/BoundaryValueDiffEqODEInterfaceExt.jl
@@ -1,103 +1,159 @@
 module BoundaryValueDiffEqODEInterfaceExt
 
-using SciMLBase, BoundaryValueDiffEq, ODEInterface
-import SciMLBase: __solve
+using SciMLBase, BoundaryValueDiffEq, ODEInterface, RecursiveArrayTools
+import BoundaryValueDiffEq: __extract_u0, __flatten_initial_guess, __extract_mesh,
+                            __initial_guess_length, __initial_guess, __has_initial_guess
 import ODEInterface: OptionsODE, OPT_ATOL, OPT_RTOL, OPT_METHODCHOICE, OPT_DIAGNOSTICOUTPUT,
                      OPT_ERRORCONTROL, OPT_SINGULARTERM, OPT_MAXSTEPS, OPT_BVPCLASS,
-                     OPT_SOLMETHOD,
-                     OPT_RHS_CALLMODE, OPT_COLLOCATIONPTS, OPT_MAXSUBINTERVALS,
-                     RHS_CALL_INSITU, evalSolution
+                     OPT_SOLMETHOD, OPT_RHS_CALLMODE, OPT_COLLOCATIONPTS,
+                     OPT_MAXSUBINTERVALS, RHS_CALL_INSITU, evalSolution
 import ODEInterface: Bvpm2, bvpm2_init, bvpm2_solve, bvpm2_destroy, bvpm2_get_x
 import ODEInterface: bvpsol
 import ODEInterface: colnew
 
+import FastClosures: @closure
 import ForwardDiff
 
-function _test_bvpm2_bvpsol_colnew_problem_criteria(
-        _, ::SciMLBase.StandardBVProblem, alg::Symbol)
-    throw(ArgumentError("$(alg) does not support standard BVProblem. Only TwoPointBVProblem is supported."))
-end
-function _test_bvpm2_bvpsol_colnew_problem_criteria(prob, ::TwoPointBVProblem, alg::Symbol)
-    @assert isinplace(prob) "$(alg) only supports inplace TwoPointBVProblem!"
-end
-
 #------
 # BVPM2
 #------
-## TODO: We can specify Drhs using forwarddiff if we want to
 function SciMLBase.__solve(prob::BVProblem, alg::BVPM2; dt = 0.0, reltol = 1e-3, kwargs...)
-    _test_bvpm2_bvpsol_colnew_problem_criteria(prob, prob.problem_type, :BVPM2)
-
-    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
-    no_odes, n, u0 = if has_initial_guess
-        length(first(prob.u0)), (length(prob.u0) - 1), reduce(hcat, prob.u0)
-    else
-        dt ≤ 0 && throw(ArgumentError("dt must be positive"))
-        length(prob.u0), Int(cld((prob.tspan[2] - prob.tspan[1]), dt)), prob.u0
+    if !(prob.problem_type isa TwoPointBVProblem)
+        throw(ArgumentError("`BVPM2` only supports `TwoPointBVProblem!`"))
     end
 
-    mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
+    t₀, t₁ = prob.tspan
+    u0_ = __extract_u0(prob.u0, prob.p, t₀)
+    u0_size = size(u0_)
+    n = __initial_guess_length(prob.u0)
+
+    n == -1 && dt ≤ 0 && throw(ArgumentError("`dt` must be positive."))
 
-    no_left_bc = length(first(prob.f.bcresid_prototype.x))
+    mesh = __extract_mesh(prob.u0, t₀, t₁, ifelse(n == -1, dt, n - 1))
+    n = length(mesh) - 1
+    no_odes = length(u0_)
 
-    initial_guess = Bvpm2()
-    bvpm2_init(initial_guess, no_odes, no_left_bc, mesh, u0, eltype(u0)[],
-        alg.max_num_subintervals)
+    if prob.f.bcresid_prototype !== nothing
+        left_bc, right_bc = prob.f.bcresid_prototype.x
+        left_bc_size, right_bc_size = size(left_bc), size(right_bc)
+        no_left_bc = length(left_bc)
+    else
+        left_bc = prob.f.bc[1](u0_, prob.p) # Guaranteed to be out of place here
+        no_left_bc = length(left_bc)
+    end
 
-    bvp2m_f(t, u, du) = prob.f(du, u, prob.p, t)
-    function bvp2m_bc(ya, yb, bca, bcb)
-        prob.f.bc[1](bca, ya, prob.p)
-        prob.f.bc[2](bcb, yb, prob.p)
-        return nothing
+    obj = Bvpm2()
+    if prob.u0 isa Function
+        guess_function = @closure (x, y) -> (y .= vec(__initial_guess(prob.u0, prob.p, x)))
+        bvpm2_init(obj, no_odes, no_left_bc, mesh, guess_function,
+            eltype(u0_)[], alg.max_num_subintervals)
+    else
+        u0 = __flatten_initial_guess(prob.u0)
+        bvpm2_init(
+            obj, no_odes, no_left_bc, mesh, u0, eltype(u0)[], alg.max_num_subintervals)
+    end
+
+    bvp2m_f = if isinplace(prob)
+        @closure (t, u, du) -> prob.f(reshape(du, u0_size), reshape(u, u0_size), prob.p, t)
+    else
+        @closure (t, u, du) -> du .= vec(prob.f(reshape(u, u0_size), prob.p, t))
+    end
+    bvp2m_bc = if isinplace(prob)
+        @closure (ya, yb, bca, bcb) -> begin
+            prob.f.bc[1](reshape(bca, left_bc_size), reshape(ya, u0_size), prob.p)
+            prob.f.bc[2](reshape(bcb, right_bc_size), reshape(yb, u0_size), prob.p)
+            return nothing
+        end
+    else
+        @closure (ya, yb, bca, bcb) -> begin
+            bca .= vec(prob.f.bc[1](reshape(ya, u0_size), prob.p))
+            bcb .= vec(prob.f.bc[2](reshape(yb, u0_size), prob.p))
+            return nothing
+        end
     end
 
     opt = OptionsODE(OPT_RTOL => reltol, OPT_METHODCHOICE => alg.method_choice,
         OPT_DIAGNOSTICOUTPUT => alg.diagnostic_output,
         OPT_SINGULARTERM => alg.singular_term, OPT_ERRORCONTROL => alg.error_control)
 
-    sol, retcode, stats = bvpm2_solve(initial_guess, bvp2m_f, bvp2m_bc, opt)
+    sol, retcode, stats = bvpm2_solve(obj, bvp2m_f, bvp2m_bc, opt)
     retcode = retcode ≥ 0 ? ReturnCode.Success : ReturnCode.Failure
     destats = SciMLBase.DEStats(
         stats["no_rhs_calls"], 0, 0, 0, stats["no_jac_calls"], 0, 0, 0, 0, 0, 0, 0, 0)
 
     x_mesh = bvpm2_get_x(sol)
     evalsol = evalSolution(sol, x_mesh)
-    sol_final = DiffEqBase.build_solution(prob, alg, x_mesh,
-        collect(Vector{eltype(evalsol)}, eachcol(evalsol)); retcode, stats = destats)
+    ivpsol = SciMLBase.build_solution(prob, alg, x_mesh,
+        map(x -> reshape(convert(Vector{eltype(evalsol)}, x), u0_size), eachcol(evalsol));
+        retcode, stats = destats, original = (sol, retcode, stats))
 
-    bvpm2_destroy(initial_guess)
+    bvpm2_destroy(obj)
     bvpm2_destroy(sol)
 
-    return sol_final
+    return ivpsol
 end
 
 #-------
 # BVPSOL
 #-------
-function SciMLBase.__solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000, reltol = 1e-3,
-        dt = 0.0, verbose = true, kwargs...)
-    _test_bvpm2_bvpsol_colnew_problem_criteria(prob, prob.problem_type, :BVPSOL)
-    @assert isa(prob.p, SciMLBase.NullParameters) "BVPSOL only supports NullParameters!"
-    @assert isa(prob.u0, AbstractVector{<:AbstractArray}) "BVPSOL requires a vector of initial guesses!"
-    n, u0 = (length(prob.u0) - 1), reduce(hcat, prob.u0)
-    mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
-
-    opt = OptionsODE(OPT_RTOL => reltol, OPT_MAXSTEPS => maxiters,
-        OPT_BVPCLASS => alg.bvpclass, OPT_SOLMETHOD => alg.sol_method,
-        OPT_RHS_CALLMODE => RHS_CALL_INSITU)
-
-    f!(t, u, du) = prob.f(du, u, prob.p, t)
-    function bc!(ya, yb, r)
-        ra = first(prob.f.bcresid_prototype.x)
-        rb = last(prob.f.bcresid_prototype.x)
-        prob.f.bc[1](ra, ya, prob.p)
-        prob.f.bc[2](rb, yb, prob.p)
-        r[1:length(ra)] .= ra
-        r[(length(ra) + 1):(length(ra) + length(rb))] .= rb
-        return r
-    end
-
-    sol_t, sol_x, retcode, stats = bvpsol(f!, bc!, mesh, u0, alg.odesolver, opt)
+function SciMLBase.__solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000,
+        reltol = 1e-3, dt = 0.0, verbose = true, kwargs...)
+    if !(prob.problem_type isa TwoPointBVProblem)
+        throw(ArgumentError("`BVPSOL` only supports `TwoPointBVProblem!`"))
+    end
+    if !__has_initial_guess(prob.u0)
+        throw(ArgumentError("Initial Guess is required for `BVPSOL`"))
+    end
+
+    t₀, t₁ = prob.tspan
+    u0_ = __extract_u0(prob.u0, prob.p, t₀)
+    u0_size = size(u0_)
+    n = __initial_guess_length(prob.u0)
+
+    n == -1 && dt ≤ 0 && throw(ArgumentError("`dt` must be positive."))
+    u0 = __flatten_initial_guess(prob.u0)
+    mesh = __extract_mesh(prob.u0, t₀, t₁, ifelse(n == -1, dt, n - 1))
+    if u0 === nothing
+        # initial_guess function was provided
+        u0 = mapreduce(@closure(t->vec(__initial_guess(prob.u0, prob.p, t))), hcat, mesh)
+    end
+
+    if prob.f.bcresid_prototype !== nothing
+        left_bc, right_bc = prob.f.bcresid_prototype.x
+        left_bc_size, right_bc_size = size(left_bc), size(right_bc)
+        no_left_bc = length(left_bc)
+    else
+        left_bc = prob.f.bc[1](u0_, prob.p) # Guaranteed to be out of place here
+        no_left_bc = length(left_bc)
+    end
+
+    opt = OptionsODE(
+        OPT_RTOL => reltol, OPT_MAXSTEPS => maxiters, OPT_BVPCLASS => alg.bvpclass,
+        OPT_SOLMETHOD => alg.sol_method, OPT_RHS_CALLMODE => RHS_CALL_INSITU)
+
+    bvpsol_f = if isinplace(prob)
+        @closure (t, u, du) -> prob.f(reshape(du, u0_size), reshape(u, u0_size), prob.p, t)
+    else
+        @closure (t, u, du) -> du .= vec(prob.f(reshape(u, u0_size), prob.p, t))
+    end
+
+    bvpsol_bc = if isinplace(prob)
+        @closure (ya, yb, r) -> begin
+            left_bc = reshape(@view(r[1:no_left_bc]), left_bc_size)
+            right_bc = reshape(@view(r[(no_left_bc + 1):end]), right_bc_size)
+            prob.f.bc[1](left_bc, reshape(ya, u0_size), prob.p)
+            prob.f.bc[2](right_bc, reshape(yb, u0_size), prob.p)
+            return nothing
+        end
+    else
+        @closure (ya, yb, r) -> begin
+            r[1:no_left_bc] .= vec(prob.f.bc[1](reshape(ya, u0_size), prob.p))
+            r[(no_left_bc + 1):end] .= vec(prob.f.bc[2](reshape(yb, u0_size), prob.p))
+            return nothing
+        end
+    end
+
+    sol_t, sol_x, retcode, stats = bvpsol(bvpsol_f, bvpsol_bc, mesh, u0, alg.odesolver, opt)
 
     if verbose
         if retcode == -3
@@ -107,9 +163,9 @@ function SciMLBase.__solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000, reltol
         elseif retcode == -5
             @warn "Given initial values inconsistent with separable linear bc"
         elseif retcode == -6
-            @warn """Iterative refinement faild to converge for `sol_method=0`
-            Termination since multiple shooting condition or
-            condition of Jacobian is too bad for `sol_method=1`"""
+            @warn "Iterative refinement faild to converge for `sol_method=0` \
+                   Termination since multiple shooting condition or \
+                   condition of Jacobian is too bad for `sol_method=1`"
         elseif retcode == -8
             @warn "Condensing algorithm for linear block system fails, try `sol_method=1`"
         elseif retcode == -9
@@ -121,67 +177,94 @@ function SciMLBase.__solve(prob::BVProblem, alg::BVPSOL; maxiters = 1000, reltol
         end
     end
 
-    return DiffEqBase.build_solution(prob, alg, sol_t,
-        collect(Vector{eltype(sol_x)}, eachcol(sol_x));
-        retcode = retcode ≥ 0 ? ReturnCode.Success : ReturnCode.Failure, stats)
+    ivpsol = SciMLBase.build_solution(prob, alg, sol_t,
+        map(x -> reshape(convert(Vector{eltype(u0_)}, x), u0_size), eachcol(sol_x));
+        retcode = retcode ≥ 0 ? ReturnCode.Success : ReturnCode.Failure,
+        stats, original = (sol_t, sol_x, retcode, stats))
+
+    return ivpsol
 end
 
 #-------
 # COLNEW
 #-------
 function SciMLBase.__solve(prob::BVProblem, alg::COLNEW; maxiters = 1000,
-        reltol = 1e-4, dt = 0.0, verbose = true, kwargs...)
-    _test_bvpm2_bvpsol_colnew_problem_criteria(prob, prob.problem_type, :COLNEW)
-    has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
-    dt ≤ 0 && throw(ArgumentError("dt must be positive"))
-    no_odes, n, u0 = if has_initial_guess
-        length(first(prob.u0)), (length(prob.u0) - 1), reduce(hcat, prob.u0)
-    else
-        length(prob.u0), Int(cld((prob.tspan[2] - prob.tspan[1]), dt)), prob.u0
+        reltol = 1e-3, dt = 0.0, verbose = true, kwargs...)
+    # FIXME: COLNEW does support MP-BVPs but in a very clunky way
+    if !(prob.problem_type isa TwoPointBVProblem)
+        throw(ArgumentError("`COLNEW` only supports `TwoPointBVProblem!`"))
+    end
+
+    dt ≤ 0 && throw(ArgumentError("`dt` must be positive"))
+
+    t₀, t₁ = prob.tspan
+    u0_ = __extract_u0(prob.u0, prob.p, t₀)
+    u0_size = size(u0_)
+    n = __initial_guess_length(prob.u0)
+
+    u0 = __flatten_initial_guess(prob.u0)
+    mesh = __extract_mesh(prob.u0, t₀, t₁, ifelse(n == -1, dt, n - 1))
+    if u0 === nothing
+        # initial_guess function was provided
+        u0 = mapreduce(@closure(t->vec(__initial_guess(prob.u0, prob.p, t))), hcat, mesh)
     end
+
+    no_odes = length(u0_)
+
+    # has_initial_guess = prob.u0 isa AbstractVector{<:AbstractArray}
+    # dt ≤ 0 && throw(ArgumentError("dt must be positive"))
+    # no_odes, n, u0 = if has_initial_guess
+    #     length(first(prob.u0)), (length(prob.u0) - 1), reduce(hcat, prob.u0)
+    # else
+    #     length(prob.u0), Int(cld((prob.tspan[2] - prob.tspan[1]), dt)), prob.u0
+    # end
+
     T = eltype(u0)
-    mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
-    opt = OptionsODE(
-        OPT_BVPCLASS => alg.bvpclass, OPT_COLLOCATIONPTS => alg.collocationpts,
+    # mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
+    opt = OptionsODE(OPT_BVPCLASS => alg.bvpclass, OPT_COLLOCATIONPTS => alg.collocationpts,
         OPT_MAXSTEPS => maxiters, OPT_DIAGNOSTICOUTPUT => alg.diagnostic_output,
         OPT_MAXSUBINTERVALS => alg.max_num_subintervals, OPT_RTOL => reltol)
     orders = ones(Int, no_odes)
     _tspan = [prob.tspan[1], prob.tspan[2]]
     iip = SciMLBase.isinplace(prob)
 
-    rhs(t, u, du) =
+    rhs = @closure (t, u, du) -> begin
         if iip
             prob.f(du, u, prob.p, t)
         else
             (du .= prob.f(u, prob.p, t))
         end
+    end
 
     if prob.f.jac === nothing
         if iip
-            jac = function (df, u, p, t)
+            jac = (df, u, p, t) -> begin
                 _du = similar(u)
                 prob.f(_du, u, p, t)
-                _f = (du, u) -> prob.f(du, u, p, t)
+                _f = @closure (du, u) -> prob.f(du, u, p, t)
                 ForwardDiff.jacobian!(df, _f, _du, u)
+                return
             end
         else
-            jac = function (df, u, p, t)
+            jac = (df, u, p, t) -> begin
                 _du = prob.f(u, p, t)
-                _f = (du, u) -> (du .= prob.f(u, p, t))
+                _f = @closure (du, u) -> (du .= prob.f(u, p, t))
                 ForwardDiff.jacobian!(df, _f, _du, u)
+                return
             end
         end
     else
         jac = prob.f.jac
     end
-    Drhs(t, u, df) = jac(df, u, prob.p, t)
+    Drhs = @closure (t, u, df) -> jac(df, u, prob.p, t)
 
-    #TODO: Fix bc and bcjac for multi-points BVP
+    bcresid_prototype, _ = BoundaryValueDiffEq.__get_bcresid_prototype(
+        prob.problem_type, prob, u0)
 
-    n_bc_a = length(first(prob.f.bcresid_prototype.x))
-    n_bc_b = length(last(prob.f.bcresid_prototype.x))
+    n_bc_a = length(first(bcresid_prototype))
+    n_bc_b = length(last(bcresid_prototype))
     zeta = vcat(fill(first(prob.tspan), n_bc_a), fill(last(prob.tspan), n_bc_b))
-    bc = function (i, z, resid)
+    bc = @closure (i, z, resid) -> begin
         tmpa = copy(z)
         tmpb = copy(z)
         tmp_resid_a = zeros(T, n_bc_a)
@@ -201,7 +284,7 @@ function SciMLBase.__solve(prob::BVProblem, alg::COLNEW; maxiters = 1000,
         end
     end
 
-    Dbc = function (i, z, dbc)
+    Dbc = @closure (i, z, dbc) -> begin
         for j in 1:n_bc_a
             if i == j
                 dbc[i] = 1.0
@@ -220,7 +303,8 @@ function SciMLBase.__solve(prob::BVProblem, alg::COLNEW; maxiters = 1000,
         if retcode == 0
             @warn "Collocation matrix is singular"
         elseif retcode == -1
-            @warn "The expected no. of subintervals exceeds storage(try to increase `OPT_MAXSUBINTERVALS`)"
+            @warn "The expected no. of subintervals exceeds storage(try to increase \
+                   `OPT_MAXSUBINTERVALS`)"
         elseif retcode == -2
             @warn "The nonlinear iteration has not converged"
         elseif retcode == -3
@@ -232,10 +316,10 @@ function SciMLBase.__solve(prob::BVProblem, alg::COLNEW; maxiters = 1000,
     destats = SciMLBase.DEStats(
         stats["no_rhs_calls"], 0, 0, 0, stats["no_jac_calls"], 0, 0, 0, 0, 0, 0, 0, 0)
 
-    return DiffEqBase.build_solution(prob, alg, mesh,
-        collect(Vector{eltype(evalsol)}, eachrow(evalsol));
+    return DiffEqBase.build_solution(
+        prob, alg, mesh, collect(Vector{eltype(evalsol)}, eachrow(evalsol));
         retcode = retcode > 0 ? ReturnCode.Success : ReturnCode.Failure,
-        stats = destats)
+        stats = destats, original = (sol, retcode, stats))
 end
 
 end
diff --git a/ext/BoundaryValueDiffEqOrdinaryDiffEqExt.jl b/ext/BoundaryValueDiffEqOrdinaryDiffEqExt.jl
deleted file mode 100644
index b03d47b5..00000000
--- a/ext/BoundaryValueDiffEqOrdinaryDiffEqExt.jl
+++ /dev/null
@@ -1,143 +0,0 @@
-module BoundaryValueDiffEqOrdinaryDiffEqExt
-
-# This extension doesn't add any new feature atm but is used to precompile some common
-# shooting workflows
-
-# We can't use @load_preference since this is a different module
-import Preferences: load_preference
-import PrecompileTools: @compile_workload, @setup_workload, @recompile_invalidations
-
-@recompile_invalidations begin
-    using BoundaryValueDiffEq, OrdinaryDiffEq
-end
-
-@setup_workload begin
-    function f1!(du, u, p, t)
-        du[1] = u[2]
-        du[2] = 0
-    end
-    f1(u, p, t) = [u[2], 0]
-
-    function bc1!(residual, u, p, t)
-        residual[1] = u(0.0)[1] - 5
-        residual[2] = u(5.0)[1]
-    end
-    bc1(u, p, t) = [u(0.0)[1] - 5, u(5.0)[1]]
-
-    bc1_a!(residual, ua, p) = (residual[1] = ua[1] - 5)
-    bc1_b!(residual, ub, p) = (residual[1] = ub[1])
-
-    bc1_a(ua, p) = [ua[1] - 5]
-    bc1_b(ub, p) = [ub[1]]
-
-    tspan = (0.0, 5.0)
-    u0 = [5.0, -3.5]
-    bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
-
-    probs = [
-        BVProblem(f1!, bc1!, u0, tspan),
-        BVProblem(f1, bc1, u0, tspan),
-        TwoPointBVProblem(f1!, (bc1_a!, bc1_b!), u0, tspan; bcresid_prototype),
-        TwoPointBVProblem(f1, (bc1_a, bc1_b), u0, tspan; bcresid_prototype)
-    ]
-
-    algs = []
-
-    if load_preference(BoundaryValueDiffEq, "PrecompileShooting", false)
-        push!(algs,
-            Shooting(Tsit5(); nlsolve = NewtonRaphson(),
-                jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))))
-    end
-
-    if load_preference(BoundaryValueDiffEq, "PrecompileMultipleShooting", false)
-        push!(algs,
-            MultipleShooting(10,
-                Tsit5();
-                nlsolve = NewtonRaphson(; autodiff = AutoForwardDiff(chunksize = 2)),
-                jac_alg = BVPJacobianAlgorithm(;
-                    bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                    nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2))))
-    end
-
-    @compile_workload begin
-        for prob in probs, alg in algs
-            solve(prob, alg)
-        end
-    end
-
-    function f1_nlls!(du, u, p, t)
-        du[1] = u[2]
-        du[2] = -u[1]
-    end
-
-    f1_nlls(u, p, t) = [u[2], -u[1]]
-
-    function bc1_nlls!(resid, sol, p, t)
-        solₜ₁ = sol(0.0)
-        solₜ₂ = sol(100.0)
-        resid[1] = solₜ₁[1]
-        resid[2] = solₜ₂[1] - 1
-        resid[3] = solₜ₂[2] + 1.729109
-        return nothing
-    end
-    bc1_nlls(sol, p, t) = [sol(0.0)[1], sol(100.0)[1] - 1, sol(1.0)[2] + 1.729109]
-
-    bc1_nlls_a!(resid, ua, p) = (resid[1] = ua[1])
-    bc1_nlls_b!(resid, ub, p) = (resid[1] = ub[1] - 1; resid[2] = ub[2] + 1.729109)
-
-    bc1_nlls_a(ua, p) = [ua[1]]
-    bc1_nlls_b(ub, p) = [ub[1] - 1, ub[2] + 1.729109]
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0]
-    bcresid_prototype1 = Array{Float64}(undef, 3)
-    bcresid_prototype2 = (Array{Float64}(undef, 1), Array{Float64}(undef, 2))
-
-    probs = [
-        BVProblem(BVPFunction(f1_nlls!, bc1_nlls!; bcresid_prototype = bcresid_prototype1),
-            u0, tspan),
-        BVProblem(BVPFunction(f1_nlls, bc1_nlls; bcresid_prototype = bcresid_prototype1),
-            u0, tspan),
-        TwoPointBVProblem(f1_nlls!, (bc1_nlls_a!, bc1_nlls_b!), u0, tspan;
-            bcresid_prototype = bcresid_prototype2),
-        TwoPointBVProblem(f1_nlls, (bc1_nlls_a, bc1_nlls_b), u0, tspan;
-            bcresid_prototype = bcresid_prototype2)
-    ]
-
-    algs = []
-
-    if load_preference(BoundaryValueDiffEq, "PrecompileShootingNLLS", false)
-        append!(algs,
-            [
-                Shooting(Tsit5(); nlsolve = LevenbergMarquardt(),
-                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))),
-                Shooting(Tsit5(); nlsolve = GaussNewton(),
-                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2)))
-            ])
-    end
-
-    if load_preference(BoundaryValueDiffEq, "PrecompileMultipleShootingNLLS", false)
-        append!(algs,
-            [
-                MultipleShooting(10, Tsit5();
-                    nlsolve = LevenbergMarquardt(;
-                        autodiff = AutoForwardDiff(chunksize = 2)),
-                    jac_alg = BVPJacobianAlgorithm(;
-                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                        nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2))),
-                MultipleShooting(10, Tsit5();
-                    nlsolve = GaussNewton(; autodiff = AutoForwardDiff(chunksize = 2)),
-                    jac_alg = BVPJacobianAlgorithm(;
-                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
-                        nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2)))
-            ])
-    end
-
-    @compile_workload begin
-        for prob in probs, alg in algs
-            solve(prob, alg)
-        end
-    end
-end
-
-end
diff --git a/src/BoundaryValueDiffEq.jl b/src/BoundaryValueDiffEq.jl
index 9a5cdb91..3710ae6a 100644
--- a/src/BoundaryValueDiffEq.jl
+++ b/src/BoundaryValueDiffEq.jl
@@ -4,27 +4,29 @@ import PrecompileTools: @compile_workload, @setup_workload, @recompile_invalidat
 
 @recompile_invalidations begin
     using ADTypes, Adapt, DiffEqBase, ForwardDiff, LinearAlgebra, NonlinearSolve,
-          PreallocationTools, Preferences, RecursiveArrayTools, Reexport, SciMLBase,
-          Setfield,
-          SparseDiffTools, Tricks
+          OrdinaryDiffEq, Preferences, RecursiveArrayTools, Reexport, SciMLBase, Setfield,
+          SparseDiffTools
+
+    using PreallocationTools: PreallocationTools, DiffCache
 
     # Special Matrix Types
     using BandedMatrices, FastAlmostBandedMatrices, SparseArrays
 
     import ADTypes: AbstractADType
-    import ArrayInterface: matrix_colors,
-                           parameterless_type, undefmatrix, fast_scalar_indexing
+    import ArrayInterface: matrix_colors, parameterless_type, undefmatrix,
+                           fast_scalar_indexing
     import ConcreteStructs: @concrete
     import DiffEqBase: solve
-    import ForwardDiff: pickchunksize
+    import FastClosures: @closure
+    import ForwardDiff: ForwardDiff, pickchunksize
+    import Logging
     import RecursiveArrayTools: ArrayPartition, DiffEqArray
     import SciMLBase: AbstractDiffEqInterpolation, StandardBVProblem, __solve, _unwrap_val
     import SparseDiffTools: AbstractSparseADType
-    import TruncatedStacktraces: @truncate_stacktrace
-    import UnPack: @unpack
 end
 
-@reexport using ADTypes, DiffEqBase, NonlinearSolve, SparseDiffTools, SciMLBase
+@reexport using ADTypes, DiffEqBase, NonlinearSolve, OrdinaryDiffEq, SparseDiffTools,
+                SciMLBase
 
 include("types.jl")
 include("utils.jl")
@@ -43,49 +45,50 @@ include("sparse_jacobians.jl")
 include("adaptivity.jl")
 include("interpolation.jl")
 
+include("default_nlsolve.jl")
+
 function __solve(prob::BVProblem, alg::BoundaryValueDiffEqAlgorithm, args...; kwargs...)
     cache = init(prob, alg, args...; kwargs...)
     return solve!(cache)
 end
 
 @setup_workload begin
-    function f1!(du, u, p, t)
+    f1! = @closure (du, u, p, t) -> begin
         du[1] = u[2]
         du[2] = 0
     end
-    f1(u, p, t) = [u[2], 0]
+    f1 = @closure (u, p, t) -> [u[2], 0]
 
-    function bc1!(residual, u, p, t)
+    bc1! = @closure (residual, u, p, t) -> begin
         residual[1] = u[1][1] - 5
-        residual[2] = u[end][1]
+        residual[2] = u[lastindex(u)][1]
     end
-    bc1(u, p, t) = [u[1][1] - 5, u[end][1]]
 
-    bc1_a!(residual, ua, p) = (residual[1] = ua[1] - 5)
-    bc1_b!(residual, ub, p) = (residual[1] = ub[1])
+    bc1 = @closure (u, p, t) -> [u[1][1] - 5, u[lastindex(u)][1]]
 
-    bc1_a(ua, p) = [ua[1] - 5]
-    bc1_b(ub, p) = [ub[1]]
+    bc1_a! = @closure (residual, ua, p) -> (residual[1] = ua[1] - 5)
+    bc1_b! = @closure (residual, ub, p) -> (residual[1] = ub[1])
+
+    bc1_a = @closure (ua, p) -> [ua[1] - 5]
+    bc1_b = @closure (ub, p) -> [ub[1]]
 
     tspan = (0.0, 5.0)
     u0 = [5.0, -3.5]
     bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
 
-    probs = [
-        BVProblem(f1!, bc1!, u0, tspan),
-        BVProblem(f1, bc1, u0, tspan),
-        TwoPointBVProblem(f1!, (bc1_a!, bc1_b!), u0, tspan; bcresid_prototype),
-        TwoPointBVProblem(f1, (bc1_a, bc1_b), u0, tspan; bcresid_prototype)
-    ]
+    probs = [BVProblem(f1!, bc1!, u0, tspan; nlls = Val(false)),
+        BVProblem(f1, bc1, u0, tspan; nlls = Val(false)),
+        TwoPointBVProblem(
+            f1!, (bc1_a!, bc1_b!), u0, tspan; bcresid_prototype, nlls = Val(false)),
+        TwoPointBVProblem(
+            f1, (bc1_a, bc1_b), u0, tspan; bcresid_prototype, nlls = Val(false))]
 
     algs = []
 
     jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))
 
     if Preferences.@load_preference("PrecompileMIRK", true)
-        append!(algs,
-            [MIRK2(; jac_alg), MIRK3(; jac_alg), MIRK4(; jac_alg),
-                MIRK5(; jac_alg), MIRK6(; jac_alg)])
+        append!(algs, [MIRK2(; jac_alg), MIRK4(; jac_alg), MIRK6(; jac_alg)])
     end
 
     @compile_workload begin
@@ -94,28 +97,30 @@ end
         end
     end
 
-    function f1_nlls!(du, u, p, t)
+    f1_nlls! = @closure (du, u, p, t) -> begin
         du[1] = u[2]
         du[2] = -u[1]
     end
 
-    f1_nlls(u, p, t) = [u[2], -u[1]]
+    f1_nlls = @closure (u, p, t) -> [u[2], -u[1]]
 
-    function bc1_nlls!(resid, sol, p, t)
+    bc1_nlls! = @closure (resid, sol, p, t) -> begin
         solₜ₁ = sol[1]
-        solₜ₂ = sol[end]
+        solₜ₂ = sol[lastindex(sol)]
         resid[1] = solₜ₁[1]
         resid[2] = solₜ₂[1] - 1
         resid[3] = solₜ₂[2] + 1.729109
         return nothing
     end
-    bc1_nlls(sol, p, t) = [sol[1][1], sol[end][1] - 1, sol[end][2] + 1.729109]
+    bc1_nlls = @closure (sol, p, t) -> [
+        sol[1][1], sol[lastindex(sol)][1] - 1, sol[lastindex(sol)][2] + 1.729109]
 
-    bc1_nlls_a!(resid, ua, p) = (resid[1] = ua[1])
-    bc1_nlls_b!(resid, ub, p) = (resid[1] = ub[1] - 1; resid[2] = ub[2] + 1.729109)
+    bc1_nlls_a! = @closure (resid, ua, p) -> (resid[1] = ua[1])
+    bc1_nlls_b! = @closure (resid, ub, p) -> (resid[1] = ub[1] - 1;
+    resid[2] = ub[2] + 1.729109)
 
-    bc1_nlls_a(ua, p) = [ua[1]]
-    bc1_nlls_b(ub, p) = [ub[1] - 1, ub[2] + 1.729109]
+    bc1_nlls_a = @closure (ua, p) -> [ua[1]]
+    bc1_nlls_b = @closure (ub, p) -> [ub[1] - 1, ub[2] + 1.729109]
 
     tspan = (0.0, 100.0)
     u0 = [0.0, 1.0]
@@ -124,43 +129,154 @@ end
 
     probs = [
         BVProblem(BVPFunction(f1_nlls!, bc1_nlls!; bcresid_prototype = bcresid_prototype1),
-            u0, tspan),
+            u0, tspan, nlls = Val(true)),
         BVProblem(BVPFunction(f1_nlls, bc1_nlls; bcresid_prototype = bcresid_prototype1),
-            u0, tspan),
+            u0, tspan, nlls = Val(true)),
         TwoPointBVProblem(f1_nlls!, (bc1_nlls_a!, bc1_nlls_b!), u0, tspan;
-            bcresid_prototype = bcresid_prototype2),
+            bcresid_prototype = bcresid_prototype2, nlls = Val(true)),
         TwoPointBVProblem(f1_nlls, (bc1_nlls_a, bc1_nlls_b), u0, tspan;
-            bcresid_prototype = bcresid_prototype2)
-    ]
+            bcresid_prototype = bcresid_prototype2, nlls = Val(true))]
 
     jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))
 
-    nlsolvers = [LevenbergMarquardt(), GaussNewton()]
+    nlsolvers = [LevenbergMarquardt(; disable_geodesic = Val(true)), GaussNewton()]
 
     algs = []
 
     if Preferences.@load_preference("PrecompileMIRKNLLS", false)
         for nlsolve in nlsolvers
-            append!(algs,
-                [
-                    MIRK2(; jac_alg, nlsolve), MIRK3(; jac_alg, nlsolve),
-                    MIRK4(; jac_alg, nlsolve), MIRK5(; jac_alg, nlsolve),
-                    MIRK6(; jac_alg, nlsolve)
-                ])
+            append!(algs, [MIRK2(; jac_alg, nlsolve), MIRK6(; jac_alg, nlsolve)])
         end
     end
 
     @compile_workload begin
         for prob in probs, alg in algs
-            solve(prob, alg; dt = 0.2)
+            solve(prob, alg; dt = 0.2, abstol = 1e-2)
+        end
+    end
+
+    bc1! = @closure (residual, u, p, t) -> begin
+        residual[1] = u(0.0)[1] - 5
+        residual[2] = u(5.0)[1]
+    end
+    bc1 = @closure (u, p, t) -> [u(0.0)[1] - 5, u(5.0)[1]]
+
+    tspan = (0.0, 5.0)
+    u0 = [5.0, -3.5]
+    bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
+
+    probs = [BVProblem(BVPFunction{true}(f1!, bc1!), u0, tspan; nlls = Val(false)),
+        BVProblem(BVPFunction{false}(f1, bc1), u0, tspan; nlls = Val(false)),
+        BVProblem(
+            BVPFunction{true}(
+                f1!, (bc1_a!, bc1_b!); bcresid_prototype, twopoint = Val(true)),
+            u0,
+            tspan;
+            nlls = Val(false)),
+        BVProblem(
+            BVPFunction{false}(f1, (bc1_a, bc1_b); bcresid_prototype, twopoint = Val(true)),
+            u0, tspan; nlls = Val(false))]
+
+    algs = []
+
+    if @load_preference("PrecompileShooting", true)
+        push!(algs,
+            Shooting(Tsit5(); nlsolve = NewtonRaphson(),
+                jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))))
+    end
+
+    if @load_preference("PrecompileMultipleShooting", true)
+        push!(algs,
+            MultipleShooting(10,
+                Tsit5();
+                nlsolve = NewtonRaphson(),
+                jac_alg = BVPJacobianAlgorithm(;
+                    bc_diffmode = AutoForwardDiff(; chunksize = 2),
+                    nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2))))
+    end
+
+    @compile_workload begin
+        for prob in probs, alg in algs
+            solve(prob, alg)
+        end
+    end
+
+    bc1_nlls! = @closure (resid, sol, p, t) -> begin
+        solₜ₁ = sol(0.0)
+        solₜ₂ = sol(100.0)
+        resid[1] = solₜ₁[1]
+        resid[2] = solₜ₂[1] - 1
+        resid[3] = solₜ₂[2] + 1.729109
+        return nothing
+    end
+    bc1_nlls = @closure (sol, p, t) -> [
+        sol(0.0)[1], sol(100.0)[1] - 1, sol(1.0)[2] + 1.729109]
+
+    tspan = (0.0, 100.0)
+    u0 = [0.0, 1.0]
+    bcresid_prototype1 = Array{Float64}(undef, 3)
+    bcresid_prototype2 = (Array{Float64}(undef, 1), Array{Float64}(undef, 2))
+
+    probs = [
+        BVProblem(
+            BVPFunction{true}(f1_nlls!, bc1_nlls!; bcresid_prototype = bcresid_prototype1),
+            u0, tspan; nlls = Val(true)),
+        BVProblem(
+            BVPFunction{false}(f1_nlls, bc1_nlls; bcresid_prototype = bcresid_prototype1),
+            u0, tspan; nlls = Val(true)),
+        BVProblem(
+            BVPFunction{true}(f1_nlls!, (bc1_nlls_a!, bc1_nlls_b!);
+                bcresid_prototype = bcresid_prototype2, twopoint = Val(true)),
+            u0,
+            tspan;
+            nlls = Val(true)),
+        BVProblem(
+            BVPFunction{false}(f1_nlls, (bc1_nlls_a, bc1_nlls_b);
+                bcresid_prototype = bcresid_prototype2, twopoint = Val(true)),
+            u0,
+            tspan;
+            nlls = Val(true))]
+
+    algs = []
+
+    if @load_preference("PrecompileShootingNLLS", true)
+        append!(algs,
+            [
+                Shooting(
+                    Tsit5(); nlsolve = LevenbergMarquardt(; disable_geodesic = Val(true)),
+                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))),
+                Shooting(Tsit5(); nlsolve = GaussNewton(),
+                    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2)))])
+    end
+
+    if @load_preference("PrecompileMultipleShootingNLLS", true)
+        append!(algs,
+            [
+                MultipleShooting(10,
+                    Tsit5();
+                    nlsolve = LevenbergMarquardt(; disable_geodesic = Val(true)),
+                    jac_alg = BVPJacobianAlgorithm(;
+                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
+                        nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2))),
+                MultipleShooting(10,
+                    Tsit5();
+                    nlsolve = GaussNewton(),
+                    jac_alg = BVPJacobianAlgorithm(;
+                        bc_diffmode = AutoForwardDiff(; chunksize = 2),
+                        nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 2)))])
+    end
+
+    @compile_workload begin
+        for prob in probs, alg in algs
+            solve(prob, alg; nlsolve_kwargs = (; abstol = 1e-2))
         end
     end
 end
 
 export Shooting, MultipleShooting
 export MIRK2, MIRK3, MIRK4, MIRK5, MIRK6
+export BVPM2, BVPSOL, COLNEW # From ODEInterface.jl
+
 export MIRKJacobianComputationAlgorithm, BVPJacobianAlgorithm
-# From ODEInterface.jl
-export BVPM2, BVPSOL, COLNEW
 
 end
diff --git a/src/adaptivity.jl b/src/adaptivity.jl
index a1e96133..3665c896 100644
--- a/src/adaptivity.jl
+++ b/src/adaptivity.jl
@@ -27,7 +27,7 @@ end
 Generate new mesh based on the defect.
 """
 @views function mesh_selector!(cache::MIRKCache{iip, T}) where {iip, T}
-    @unpack M, order, defect, mesh, mesh_dt = cache
+    (; M, order, defect, mesh, mesh_dt) = cache
     (_, MxNsub, abstol, _, _), kwargs = __split_mirk_kwargs(; cache.kwargs...)
     N = length(cache.mesh)
 
@@ -83,8 +83,8 @@ end
 
 Generate a new mesh based on the `ŝ`.
 """
-function redistribute!(cache::MIRKCache{iip, T}, Nsub_star, ŝ, mesh,
-        mesh_dt) where {iip, T}
+function redistribute!(
+        cache::MIRKCache{iip, T}, Nsub_star, ŝ, mesh, mesh_dt) where {iip, T}
     N = length(mesh)
     ζ = sum(ŝ .* mesh_dt) / Nsub_star
     k, i = 1, 0
@@ -144,8 +144,8 @@ the RK method in 'k_discrete', plus some new stages in 'k_interp' to construct
 an interpolant
 """
 @views function defect_estimate!(cache::MIRKCache{iip, T}) where {iip, T}
-    @unpack M, stage, f, alg, mesh, mesh_dt, defect = cache
-    @unpack s_star, τ_star = cache.ITU
+    (; M, stage, f, alg, mesh, mesh_dt, defect) = cache
+    (; s_star, τ_star) = cache.ITU
 
     # Evaluate at the first sample point
     w₁, w₁′ = interp_weights(τ_star, alg)
@@ -190,8 +190,8 @@ end
 Here, the ki_interp is the stages in one subinterval.
 """
 @views function interp_setup!(cache::MIRKCache{iip, T}) where {iip, T}
-    @unpack x_star, s_star, c_star, v_star = cache.ITU
-    @unpack k_interp, k_discrete, f, stage, new_stages, y, p, mesh, mesh_dt = cache
+    (; x_star, s_star, c_star, v_star) = cache.ITU
+    (; k_interp, k_discrete, f, stage, new_stages, y, p, mesh, mesh_dt) = cache
 
     for r in 1:(s_star - stage)
         idx₁ = ((1:stage) .- 1) .* (s_star - stage) .+ r
@@ -201,8 +201,8 @@ Here, the ki_interp is the stages in one subinterval.
         end
         if r > 1
             for j in eachindex(k_interp)
-                __maybe_matmul!(new_stages[j], k_interp[j][:, 1:(r - 1)], x_star[idx₂],
-                    T(1), T(1))
+                __maybe_matmul!(
+                    new_stages[j], k_interp[j][:, 1:(r - 1)], x_star[idx₂], T(1), T(1))
             end
         end
         for i in eachindex(new_stages)
@@ -230,43 +230,45 @@ function sum_stages!(cache::MIRKCache, w, w′, i::Int, dt = cache.mesh_dt[i])
 end
 
 function sum_stages!(z::AbstractArray, cache::MIRKCache, w, i::Int, dt = cache.mesh_dt[i])
-    @unpack M, stage, mesh, k_discrete, k_interp, mesh_dt = cache
-    @unpack s_star = cache.ITU
+    (; M, stage, mesh, k_discrete, k_interp, mesh_dt) = cache
+    (; s_star) = cache.ITU
 
     z .= zero(z)
     __maybe_matmul!(z, k_discrete[i].du[:, 1:stage], w[1:stage])
-    __maybe_matmul!(z, k_interp[i][:, 1:(s_star - stage)],
-        w[(stage + 1):s_star], true, true)
+    __maybe_matmul!(
+        z, k_interp[i][:, 1:(s_star - stage)], w[(stage + 1):s_star], true, true)
     z .= z .* dt .+ cache.y₀[i]
 
     return z
 end
 
 @views function sum_stages!(z, z′, cache::MIRKCache, w, w′, i::Int, dt = cache.mesh_dt[i])
-    @unpack M, stage, mesh, k_discrete, k_interp, mesh_dt = cache
-    @unpack s_star = cache.ITU
+    (; M, stage, mesh, k_discrete, k_interp, mesh_dt) = cache
+    (; s_star) = cache.ITU
 
     z .= zero(z)
     __maybe_matmul!(z, k_discrete[i].du[:, 1:stage], w[1:stage])
-    __maybe_matmul!(z, k_interp[i][:, 1:(s_star - stage)],
-        w[(stage + 1):s_star], true, true)
+    __maybe_matmul!(
+        z, k_interp[i][:, 1:(s_star - stage)], w[(stage + 1):s_star], true, true)
     z′ .= zero(z′)
     __maybe_matmul!(z′, k_discrete[i].du[:, 1:stage], w′[1:stage])
-    __maybe_matmul!(z′, k_interp[i][:, 1:(s_star - stage)],
-        w′[(stage + 1):s_star], true, true)
+    __maybe_matmul!(
+        z′, k_interp[i][:, 1:(s_star - stage)], w′[(stage + 1):s_star], true, true)
     z .= z .* dt[1] .+ cache.y₀[i]
 
     return z, z′
 end
 
+"""
+    interp_weights(τ, alg)
+
+interp_weights: solver-specified interpolation weights and its first derivative
+"""
+function interp_weights end
+
 for order in (2, 3, 4, 5, 6)
     alg = Symbol("MIRK$(order)")
     @eval begin
-        """
-            interp_weights(τ, alg)
-
-        interp_weights: solver-specified interpolation weights and its first derivative
-        """
         function interp_weights(τ::T, ::$(alg)) where {T}
             if $(order == 2)
                 w = [0, τ * (1 - τ / 2), τ^2 / 2]
@@ -276,14 +278,12 @@ for order in (2, 3, 4, 5, 6)
                 wp = [0, 1 - τ, τ]
             elseif $(order == 3)
                 w = [τ / 4.0 * (2.0 * τ^2 - 5.0 * τ + 4.0),
-                    -3.0 / 4.0 * τ^2 * (2.0 * τ - 3.0),
-                    τ^2 * (τ - 1.0)]
+                    -3.0 / 4.0 * τ^2 * (2.0 * τ - 3.0), τ^2 * (τ - 1.0)]
 
                 #     Derivative polynomials.
 
                 wp = [3.0 / 2.0 * (τ - 2.0 / 3.0) * (τ - 1.0),
-                    -9.0 / 2.0 * τ * (τ - 1.0),
-                    3.0 * τ * (τ - 2.0 / 3.0)]
+                    -9.0 / 2.0 * τ * (τ - 1.0), 3.0 * τ * (τ - 2.0 / 3.0)]
             elseif $(order == 4)
                 t2 = τ * τ
                 tm1 = τ - 1.0
@@ -297,48 +297,37 @@ for order in (2, 3, 4, 5, 6)
 
                 #   Derivative polynomials
 
-                wp = [-tm1 * t4m3 * t2m1 / 3.0,
-                    τ * t2m1 * t4m3,
-                    4.0 * τ * t4m3 * tm1,
-                    -32.0 * τ * t2m1 * tm1 / 3.0]
+                wp = [-tm1 * t4m3 * t2m1 / 3.0, τ * t2m1 * t4m3,
+                    4.0 * τ * t4m3 * tm1, -32.0 * τ * t2m1 * tm1 / 3.0]
             elseif $(order == 5)
                 w = [
                     τ * (22464.0 - 83910.0 * τ + 143041.0 * τ^2 - 113808.0 * τ^3 +
                      33256.0 * τ^4) / 22464.0,
-                    τ^2 * (-2418.0 + 12303.0 * τ - 19512.0 * τ^2 + 10904.0 * τ^3) /
-                    3360.0,
+                    τ^2 * (-2418.0 + 12303.0 * τ - 19512.0 * τ^2 + 10904.0 * τ^3) / 3360.0,
                     -8 / 81 * τ^2 * (-78.0 + 209.0 * τ - 204.0 * τ^2 + 8.0 * τ^3),
-                    -25 / 1134 * τ^2 *
-                    (-390.0 + 1045.0 * τ - 1020.0 * τ^2 + 328.0 * τ^3),
-                    -25 / 5184 * τ^2 *
-                    (390.0 + 255.0 * τ - 1680.0 * τ^2 + 2072.0 * τ^3),
-                    279841 / 168480 * τ^2 *
-                    (-6.0 + 21.0 * τ - 24.0 * τ^2 + 8.0 * τ^3)]
+                    -25 / 1134 * τ^2 * (-390.0 + 1045.0 * τ - 1020.0 * τ^2 + 328.0 * τ^3),
+                    -25 / 5184 * τ^2 * (390.0 + 255.0 * τ - 1680.0 * τ^2 + 2072.0 * τ^3),
+                    279841 / 168480 * τ^2 * (-6.0 + 21.0 * τ - 24.0 * τ^2 + 8.0 * τ^3)]
 
                 #   Derivative polynomials
 
                 wp = [
-                    1.0 - 13985 // 1872 * τ + 143041 // 7488 * τ^2 -
-                    2371 // 117 * τ^3 +
+                    1.0 - 13985 // 1872 * τ + 143041 // 7488 * τ^2 - 2371 // 117 * τ^3 +
                     20785 // 2808 * τ^4,
                     -403 // 280 * τ + 12303 // 1120 * τ^2 - 813 // 35 * τ^3 +
                     1363 // 84 * τ^4,
-                    416 // 27 * τ - 1672 // 27 * τ^2 + 2176 // 27 * τ^3 -
-                    320 // 81 * τ^4,
+                    416 // 27 * τ - 1672 // 27 * τ^2 + 2176 // 27 * τ^3 - 320 // 81 * τ^4,
                     3250 // 189 * τ - 26125 // 378 * τ^2 + 17000 // 189 * τ^3 -
                     20500 // 567 * τ^4,
                     -1625 // 432 * τ - 2125 // 576 * τ^2 + 875 // 27 * τ^3 -
                     32375 // 648 * τ^4,
-                    -279841 // 14040 * τ + 1958887 // 18720 * τ^2 -
-                    279841 // 1755 * τ^3 +
+                    -279841 // 14040 * τ + 1958887 // 18720 * τ^2 - 279841 // 1755 * τ^3 +
                     279841 // 4212 * τ^4]
             elseif $(order == 6)
                 w = [
                     τ - 28607 // 7434 * τ^2 - 166210 // 33453 * τ^3 +
-                    334780 // 11151 * τ^4 -
-                    1911296 // 55755 * τ^5 + 406528 // 33453 * τ^6,
-                    777 // 590 * τ^2 - 2534158 // 234171 * τ^3 +
-                    2088580 // 78057 * τ^4 -
+                    334780 // 11151 * τ^4 - 1911296 // 55755 * τ^5 + 406528 // 33453 * τ^6,
+                    777 // 590 * τ^2 - 2534158 // 234171 * τ^3 + 2088580 // 78057 * τ^4 -
                     10479104 // 390285 * τ^5 + 11328512 // 1170855 * τ^6,
                     -1008 // 59 * τ^2 + 222176 // 1593 * τ^3 - 180032 // 531 * τ^4 +
                     876544 // 2655 * τ^5 - 180224 // 1593 * τ^6,
@@ -347,22 +336,20 @@ for order in (2, 3, 4, 5, 6)
                     -378 // 59 * τ^2 + 27772 // 531 * τ^3 - 22504 // 177 * τ^4 +
                     109568 // 885 * τ^5 - 22528 // 531 * τ^6,
                     -95232 // 413 * τ^2 + 62384128 // 33453 * τ^3 -
-                    49429504 // 11151 * τ^4 +
-                    46759936 // 11151 * τ^5 - 46661632 // 33453 * τ^6,
-                    896 // 5 * τ^2 - 4352 // 3 * τ^3 + 3456 * τ^4 -
-                    16384 // 5 * τ^5 +
+                    49429504 // 11151 * τ^4 + 46759936 // 11151 * τ^5 -
+                    46661632 // 33453 * τ^6,
+                    896 // 5 * τ^2 - 4352 // 3 * τ^3 + 3456 * τ^4 - 16384 // 5 * τ^5 +
                     16384 // 15 * τ^6,
                     50176 // 531 * τ^2 - 179554304 // 234171 * τ^3 +
-                    143363072 // 78057 * τ^4 -
-                    136675328 // 78057 * τ^5 + 137363456 // 234171 * τ^6,
+                    143363072 // 78057 * τ^4 - 136675328 // 78057 * τ^5 +
+                    137363456 // 234171 * τ^6,
                     16384 // 441 * τ^3 - 16384 // 147 * τ^4 + 16384 // 147 * τ^5 -
                     16384 // 441 * τ^6]
 
                 #     Derivative polynomials.
 
                 wp = [
-                    1 - 28607 // 3717 * τ - 166210 // 11151 * τ^2 +
-                    1339120 // 11151 * τ^3 -
+                    1 - 28607 // 3717 * τ - 166210 // 11151 * τ^2 + 1339120 // 11151 * τ^3 -
                     1911296 // 11151 * τ^4 + 813056 // 11151 * τ^5,
                     777 // 295 * τ - 2534158 // 78057 * τ^2 + 8354320 // 78057 * τ^3 -
                     10479104 // 78057 * τ^4 + 22657024 // 390285 * τ^5,
@@ -373,13 +360,13 @@ for order in (2, 3, 4, 5, 6)
                     -756 // 59 * τ + 27772 // 177 * τ^2 - 90016 // 177 * τ^3 +
                     109568 // 177 * τ^4 - 45056 // 177 * τ^5,
                     -190464 // 413 * τ + 62384128 // 11151 * τ^2 -
-                    197718016 // 11151 * τ^3 +
-                    233799680 // 11151 * τ^4 - 93323264 // 11151 * τ^5,
+                    197718016 // 11151 * τ^3 + 233799680 // 11151 * τ^4 -
+                    93323264 // 11151 * τ^5,
                     1792 // 5 * τ - 4352 * τ^2 + 13824 * τ^3 - 16384 * τ^4 +
                     32768 // 5 * τ^5,
                     100352 // 531 * τ - 179554304 // 78057 * τ^2 +
-                    573452288 // 78057 * τ^3 -
-                    683376640 // 78057 * τ^4 + 274726912 // 78057 * τ^5,
+                    573452288 // 78057 * τ^3 - 683376640 // 78057 * τ^4 +
+                    274726912 // 78057 * τ^5,
                     16384 // 147 * τ^2 - 65536 // 147 * τ^3 + 81920 // 147 * τ^4 -
                     32768 // 147 * τ^5]
             end
@@ -389,7 +376,7 @@ for order in (2, 3, 4, 5, 6)
 end
 
 function sol_eval(cache::MIRKCache{T}, t::T) where {T}
-    @unpack M, mesh, mesh_dt, alg, k_discrete, k_interp, y = cache
+    (; M, mesh, mesh_dt, alg, k_discrete, k_interp, y) = cache
 
     @assert mesh[1] ≤ t ≤ mesh[end]
     i = interval(mesh, t)
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 00cb80f8..4caf2aaf 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -1,9 +1,30 @@
 # Algorithms
 abstract type BoundaryValueDiffEqAlgorithm <: SciMLBase.AbstractBVPAlgorithm end
+abstract type AbstractShooting <: BoundaryValueDiffEqAlgorithm end
 abstract type AbstractMIRK <: BoundaryValueDiffEqAlgorithm end
 
+## Disable the ugly verbose printing by default
+@inline __modifier_text!(list, fieldname, field) = push!(list, "$fieldname = $(field)")
+@inline __modifier_text!(list, fieldname, ::Nothing) = list
+@inline __modifier_text!(list, fieldname, ::Missing) = list
+@inline function __modifier_text!(list, fieldname, field::SciMLBase.AbstractODEAlgorithm)
+    push!(list, "$fieldname = $(__nameof(field))()")
+end
+
+function Base.show(io::IO, alg::BoundaryValueDiffEqAlgorithm)
+    print(io, "$(__nameof(alg))(")
+    modifiers = String[]
+    for field in fieldnames(typeof(alg))
+        __modifier_text!(modifiers, field, getfield(alg, field))
+    end
+    print(io, join(modifiers, ", "))
+    print(io, ")")
+end
+
 """
-    Shooting(ode_alg = nothing; nlsolve = nothing, jac_alg = BVPJacobianAlgorithm())
+    Shooting(ode_alg; kwargs...)
+    Shooting(ode_alg, nlsolve; kwargs...)
+    Shooting(; ode_alg = nothing, nlsolve = nothing, jac_alg = nothing)
 
 Single shooting method, reduces BVP to an initial value problem and solves the IVP.
 
@@ -12,60 +33,39 @@ Single shooting method, reduces BVP to an initial value problem and solves the I
   - `ode_alg`: ODE algorithm to use for solving the IVP. Any solver which conforms to the
     SciML `ODEProblem` interface can be used! (Defaults to `nothing` which will use
     poly-algorithm if `DifferentialEquations.jl` is loaded else this must be supplied)
-
-## Keyword Arguments
-
   - `nlsolve`: Internal Nonlinear solver. Any solver which conforms to the SciML
     `NonlinearProblem` interface can be used. Note that any autodiff argument for the solver
     will be ignored and a custom jacobian algorithm will be used.
-  - `jac_alg`: Jacobian Algorithm used for the nonlinear solver. Defaults to
-    `BVPJacobianAlgorithm()`, which automatically decides the best algorithm to use based
-    on the input types and problem type. Only `diffmode` is used (defaults to
-    `AutoForwardDiff` if possible else `AutoFiniteDiff`).
-
-!!! note
-    For type-stability, the chunksizes for ForwardDiff ADTypes in `BVPJacobianAlgorithm`
-    must be provided.
+  - `jac_alg`: Jacobian Algorithm used for the Nonlinear Solver. If this is not set, we
+    check if `nlsolve.ad` exists and is not nothing. If it is, we use that to construct
+    the jacobian. If not, we try to use the best algorithm based on the input types
+    and problem type. If `BVPJacobianAlgorithm` is provided, only `diffmode` is used
+    (defaults to `AutoForwardDiff` if possible else `AutoFiniteDiff`).
 """
-struct Shooting{O, N, L <: BVPJacobianAlgorithm} <: BoundaryValueDiffEqAlgorithm
-    ode_alg::O
-    nlsolve::N
-    jac_alg::L
-end
-
-function concretize_jacobian_algorithm(alg::Shooting, prob)
-    jac_alg = alg.jac_alg
-    diffmode = jac_alg.diffmode === nothing ? __default_nonsparse_ad(prob.u0) :
-               jac_alg.diffmode
-    return Shooting(alg.ode_alg, alg.nlsolve, BVPJacobianAlgorithm(diffmode))
+@concrete struct Shooting{J <: BVPJacobianAlgorithm} <: AbstractShooting
+    ode_alg
+    nlsolve
+    jac_alg::J
 end
 
-function Shooting(ode_alg = nothing; nlsolve = nothing, jac_alg = nothing)
-    jac_alg === nothing && (jac_alg = __propagate_nlsolve_ad_to_jac_alg(nlsolve))
-    return Shooting(ode_alg, nlsolve, jac_alg)
+function Shooting(; ode_alg = nothing, nlsolve = nothing, jac_alg = nothing)
+    return Shooting(ode_alg, nlsolve, __materialize_jacobian_algorithm(nlsolve, jac_alg))
 end
+@inline Shooting(ode_alg; kwargs...) = Shooting(; ode_alg, kwargs...)
+@inline Shooting(ode_alg, nlsolve; kwargs...) = Shooting(; ode_alg, nlsolve, kwargs...)
 
-Shooting(ode_alg, nlsolve; jac_alg = nothing) = Shooting(ode_alg; nlsolve, jac_alg)
-
-# This is a deprecation path. We forward the `ad` from nonlinear solver to `jac_alg`.
-# We will drop this function in
-function __propagate_nlsolve_ad_to_jac_alg(nlsolve::N) where {N}
-    # Defaults so no depwarn
-    nlsolve === nothing && return BVPJacobianAlgorithm()
-    ad = hasfield(N, :ad) ? nlsolve.ad : nothing
-    ad === nothing && return BVPJacobianAlgorithm()
-
-    Base.depwarn(
-        "Setting autodiff to the nonlinear solver in Shooting has been deprecated \
-         and will have no effect from the next major release. Update to use \
-         `BVPJacobianAlgorithm` directly",
-        :Shooting)
-    return BVPJacobianAlgorithm(ad)
+@inline function concretize_jacobian_algorithm(alg::Shooting, prob)
+    alg.jac_alg.diffmode === nothing &&
+        (return @set alg.jac_alg.diffmode = __default_nonsparse_ad(prob.u0))
+    return alg
 end
 
 """
-    MultipleShooting(nshoots::Int, ode_alg = nothing; nlsolve = nothing,
-        grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
+    MultipleShooting(; nshoots::Int, ode_alg = nothing, nlsolve = nothing,
+        grid_coarsening = true, jac_alg = nothing)
+    MultipleShooting(nshoots::Int; kwargs...)
+    MultipleShooting(nshoots::Int, ode_alg; kwargs...)
+    MultipleShooting(nshoots::Int, ode_alg, nlsolve; kwargs...)
 
 Multiple Shooting method, reduces BVP to an initial value problem and solves the IVP.
 Significantly more stable than Single Shooting.
@@ -73,40 +73,35 @@ Significantly more stable than Single Shooting.
 ## Arguments
 
   - `nshoots`: Number of shooting points.
+
   - `ode_alg`: ODE algorithm to use for solving the IVP. Any solver which conforms to the
     SciML `ODEProblem` interface can be used! (Defaults to `nothing` which will use
     poly-algorithm if `DifferentialEquations.jl` is loaded else this must be supplied)
-
-## Keyword Arguments
-
   - `nlsolve`: Internal Nonlinear solver. Any solver which conforms to the SciML
-    `NonlinearProblem` interface can be used. Note that any autodiff argument for the solver
-    will be ignored and a custom jacobian algorithm will be used.
+    `NonlinearProblem` interface can be used.
   - `jac_alg`: Jacobian Algorithm used for the nonlinear solver. Defaults to
     `BVPJacobianAlgorithm()`, which automatically decides the best algorithm to use based
     on the input types and problem type.
-    - For `TwoPointBVProblem`, only `diffmode` is used (defaults to
-      `AutoSparseForwardDiff` if possible else `AutoSparseFiniteDiff`).
-    - For `BVProblem`, `bc_diffmode` and `nonbc_diffmode` are used. For `nonbc_diffmode`
-      defaults to `AutoSparseForwardDiff` if possible else `AutoSparseFiniteDiff`. For
-      `bc_diffmode`, defaults to `AutoForwardDiff` if possible else `AutoFiniteDiff`.
+
+      + For `TwoPointBVProblem`, only `diffmode` is used (defaults to
+        `AutoSparseForwardDiff` if possible else `AutoSparseFiniteDiff`).
+      + For `BVProblem`, `bc_diffmode` and `nonbc_diffmode` are used. For `nonbc_diffmode`
+        we default to `AutoSparseForwardDiff` if possible else `AutoSparseFiniteDiff`. For
+        `bc_diffmode`, we default to `AutoForwardDiff` if possible else `AutoFiniteDiff`.
   - `grid_coarsening`: Coarsening the multiple-shooting grid to generate a stable IVP
     solution. Possible Choices:
-    - `true`: Halve the grid size, till we reach a grid size of 1.
-    - `false`: Do not coarsen the grid. Solve a Multiple Shooting Problem and finally
-      solve a Single Shooting Problem.
-    - `AbstractVector{<:Int}` or `Ntuple{N, <:Integer}`: Use the provided grid coarsening.
-      For example, if `nshoots = 10` and `grid_coarsening = [5, 2]`, then the grid will be
-      coarsened to `[5, 2]`. Note that `1` should not be present in the grid coarsening.
-    - `Function`: Takes the current number of shooting points and returns the next number
-      of shooting points. For example, if `nshoots = 10` and
-      `grid_coarsening = n -> n ÷ 2`, then the grid will be coarsened to `[5, 2]`.
 
-!!! note
-    For type-stability, the chunksizes for ForwardDiff ADTypes in `BVPJacobianAlgorithm`
-    must be provided.
+      + `true`: Halve the grid size, till we reach a grid size of 1.
+      + `false`: Do not coarsen the grid. Solve a Multiple Shooting Problem and finally
+        solve a Single Shooting Problem.
+      + `AbstractVector{<:Int}` or `Ntuple{N, <:Integer}`: Use the provided grid coarsening.
+        For example, if `nshoots = 10` and `grid_coarsening = [5, 2]`, then the grid will be
+        coarsened to `[5, 2]`. Note that `1` should not be present in the grid coarsening.
+      + `Function`: Takes the current number of shooting points and returns the next number
+        of shooting points. For example, if `nshoots = 10` and
+        `grid_coarsening = n -> n ÷ 2`, then the grid will be coarsened to `[5, 2]`.
 """
-@concrete struct MultipleShooting{J <: BVPJacobianAlgorithm}
+@concrete struct MultipleShooting{J <: BVPJacobianAlgorithm} <: AbstractShooting
     ode_alg
     nlsolve
     jac_alg::J
@@ -116,34 +111,43 @@ end
 
 function concretize_jacobian_algorithm(alg::MultipleShooting, prob)
     jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
-    return MultipleShooting(alg.ode_alg, alg.nlsolve, jac_alg, alg.nshoots,
-        alg.grid_coarsening)
+    return MultipleShooting(
+        alg.ode_alg, alg.nlsolve, jac_alg, alg.nshoots, alg.grid_coarsening)
 end
 
 function update_nshoots(alg::MultipleShooting, nshoots::Int)
-    return MultipleShooting(alg.ode_alg, alg.nlsolve, alg.jac_alg, nshoots,
-        alg.grid_coarsening)
+    return MultipleShooting(
+        alg.ode_alg, alg.nlsolve, alg.jac_alg, nshoots, alg.grid_coarsening)
 end
 
-function MultipleShooting(nshoots::Int, ode_alg = nothing; nlsolve = nothing,
-        grid_coarsening = true, jac_alg = BVPJacobianAlgorithm())
-    @assert grid_coarsening isa Bool || grid_coarsening isa Function ||
-            grid_coarsening isa AbstractVector{<:Integer} ||
-            grid_coarsening isa NTuple{N, <:Integer} where {N}
+function MultipleShooting(; nshoots::Int,
+        ode_alg = nothing,
+        nlsolve = nothing,
+        grid_coarsening::Union{
+            Bool, Function, <:AbstractVector{<:Integer}, Tuple{Vararg{Integer}}} = true,
+        jac_alg = nothing)
     grid_coarsening isa Tuple && (grid_coarsening = Vector(grid_coarsening...))
     if grid_coarsening isa AbstractVector
         sort!(grid_coarsening; rev = true)
         @assert all(grid_coarsening .> 0) && 1 ∉ grid_coarsening
     end
-    return MultipleShooting(ode_alg, nlsolve, jac_alg, nshoots, grid_coarsening)
+    return MultipleShooting(
+        ode_alg, nlsolve, __materialize_jacobian_algorithm(nlsolve, jac_alg),
+        nshoots, grid_coarsening)
 end
+@inline MultipleShooting(nshoots::Int; kwargs...) = MultipleShooting(; nshoots, kwargs...)
+@inline MultipleShooting(nshoots::Int, ode_alg; kwargs...) = MultipleShooting(;
+    nshoots, ode_alg, kwargs...)
+@inline MultipleShooting(nshoots::Int, ode_alg, nlsolve; kwargs...) = MultipleShooting(;
+    nshoots, ode_alg, nlsolve, kwargs...)
 
 for order in (2, 3, 4, 5, 6)
     alg = Symbol("MIRK$(order)")
 
     @eval begin
         """
-            $($alg)(; nlsolve = NewtonRaphson(), jac_alg = BVPJacobianAlgorithm())
+            $($alg)(; nlsolve = NewtonRaphson(), jac_alg = BVPJacobianAlgorithm(),
+                    defect_threshold = 0.1, max_num_subintervals = 3000)
 
         $($order)th order Monotonic Implicit Runge Kutta method.
 
@@ -161,6 +165,8 @@ for order in (2, 3, 4, 5, 6)
               `nonbc_diffmode` defaults to `AutoSparseForwardDiff` if possible else
               `AutoSparseFiniteDiff`. For `bc_diffmode`, defaults to `AutoForwardDiff` if
               possible else `AutoFiniteDiff`.
+          - `defect_threshold`: Threshold for defect control.
+          - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
 
         !!! note
             For type-stability, the chunksizes for ForwardDiff ADTypes in
@@ -168,6 +174,7 @@ for order in (2, 3, 4, 5, 6)
 
         ## References
 
+        ```bibtex
         @article{Enright1996RungeKuttaSW,
             title={Runge-Kutta Software with Defect Control for Boundary Value ODEs},
             author={Wayne H. Enright and Paul H. Muir},
@@ -176,10 +183,13 @@ for order in (2, 3, 4, 5, 6)
             volume={17},
             pages={479-497}
         }
+        ```
         """
-        Base.@kwdef struct $(alg){N, J <: BVPJacobianAlgorithm} <: AbstractMIRK
+        @kwdef struct $(alg){N, J <: BVPJacobianAlgorithm, T} <: AbstractMIRK
             nlsolve::N = nothing
             jac_alg::J = BVPJacobianAlgorithm()
+            defect_threshold::T = 0.1
+            max_num_subintervals::Int = 3000
         end
     end
 end
@@ -196,42 +206,58 @@ Fortran code for solving two-point boundary value problems. For detailed documen
 ## Keyword Arguments:
 
     - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
-    - `method_choice`: Choice for IVP-solvers, default as Runge-Kutta method of order 4, available choices:
+    - `method_choice`: Choice for IVP-solvers, default as Runge-Kutta method of order 4,
+      available choices:
         - `2`: Runge-Kutta method of order 2.
         - `4`: Runge-Kutta method of order 4.
         - `6`: Runge-Kutta method of order 6.
-    - `diagnostic_output`: Diagnostic output for BVPM2, default as non printout, available choices:
+    - `diagnostic_output`: Diagnostic output for BVPM2, default as non printout, available
+      choices:
         - `-1`: Full diagnostic printout.
         - `0`: Selected printout.
         - `1`: No printout.
-    - `error_control`: Determines the error-estimation for which RTOL is used, default as defect control, available choices:
+    - `error_control`: Determines the error-estimation for which RTOL is used, default as
+      defect control, available choices:
         - `1`: Defect control.
         - `2`: Global error control.
         - `3`: Defect and then global error control.
         - `4`: Linear combination of defect and global error control.
-    - `singular_term`: either nothing if the ODEs have no singular terms at the left boundary or a constant (d,d) matrix for the
+    - `singular_term`: either nothing if the ODEs have no singular terms at the left
+      boundary or a constant (d,d) matrix for the
         singular term.
 
-!!! warning
-    Only supports inplace two-point boundary value problems, with very limited forms of
-    input structures!
-
 !!! note
+
     Only available if the `ODEInterface` package is loaded.
 """
-Base.@kwdef struct BVPM2{S} <: BoundaryValueDiffEqAlgorithm
-    max_num_subintervals::Int = 3000
-    method_choice::Int = 4
-    diagnostic_output::Int = -1
-    error_control::Int = 1
-    singular_term::S = nothing
+struct BVPM2{S} <: BoundaryValueDiffEqAlgorithm
+    max_num_subintervals::Int
+    method_choice::Int
+    diagnostic_output::Int
+    error_control::Int
+    singular_term::S
+
+    function BVPM2(max_num_subintervals::Int, method_choice::Int, diagnostic_output::Int,
+            error_control::Int, singular_term::Union{Nothing, AbstractMatrix})
+        if Base.get_extension(@__MODULE__, :BoundaryValueDiffEqODEInterfaceExt) === nothing
+            error("`BVPM2` requires `ODEInterface.jl` to be loaded")
+        end
+        return new{typeof(singular_term)}(max_num_subintervals, method_choice,
+            diagnostic_output, error_control, singular_term)
+    end
+end
+
+function BVPM2(; max_num_subintervals::Int = 3000, method_choice::Int = 4,
+        diagnostic_output::Int = -1, error_control::Int = 1, singular_term = nothing)
+    return BVPM2(max_num_subintervals, method_choice,
+        diagnostic_output, error_control, singular_term)
 end
 
 """
     BVPSOL(; bvpclass = 2, sol_method = 0, odesolver = nothing)
     BVPSOL(bvpclass::Int, sol_methods::Int, odesolver)
 
-A FORTRAN77 code which solves highly nonlinear two point boundary value problems using a
+A FORTRAN77 code which solves highly nonlinear **two point boundary value problems** using a
 local linear solver (condensing algorithm) or a global sparse linear solver for the solution
 of the arising linear subproblems, by Peter Deuflhard, Georg Bader, Lutz Weimann.
 For detailed documentation, see
@@ -239,64 +265,95 @@ For detailed documentation, see
 
 ## Keyword Arguments
 
-    - `bvpclass`: Boundary value problem classification, default as highly nonlinear with bad initial data, available choices:
+    - `bvpclass`: Boundary value problem classification, default as highly nonlinear with
+      bad initial data, available choices:
         - `0`: Linear boundary value problem.
         - `1`: Nonlinear with good initial data.
         - `2`: Highly Nonlinear with bad initial data.
-        - `3`: Highly nonlinear with bad initial data and initial rank reduction to seperable
-            linear boundary conditions.
-    - `sol_method`: Switch for solution methods, default as local linear solver with condensing algorithm, available choices:
+        - `3`: Highly nonlinear with bad initial data and initial rank reduction to
+          seperable linear boundary conditions.
+    - `sol_method`: Switch for solution methods, default as local linear solver with
+      condensing algorithm, available choices:
         - `0`: Use local linear solver with condensing algorithm.
         - `1`: Use global sparse linear solver.
     - `odesolver`: Either `nothing` or ode-solver(dopri5, dop853, seulex, etc.).
 
-!!! warning
-    Only supports inplace two-point boundary value problems, with very limited forms of
-    input structures!
-
 !!! note
+
     Only available if the `ODEInterface` package is loaded.
 """
-Base.@kwdef struct BVPSOL{O} <: BoundaryValueDiffEqAlgorithm
-    bvpclass::Int = 2
-    sol_method::Int = 0
-    odesolver::O = nothing
+struct BVPSOL{O} <: BoundaryValueDiffEqAlgorithm
+    bvpclass::Int
+    sol_method::Int
+    odesolver::O
+
+    function BVPSOL(bvpclass::Int, sol_method::Int, odesolver)
+        if Base.get_extension(@__MODULE__, :BoundaryValueDiffEqODEInterfaceExt) === nothing
+            error("`BVPSOL` requires `ODEInterface.jl` to be loaded")
+        end
+        return new{typeof(odesolver)}(bvpclass, sol_method, odesolver)
+    end
+end
+
+function BVPSOL(; bvpclass = 2, sol_method = 0, odesolver = nothing)
+    return BVPSOL(bvpclass, sol_method, odesolver)
 end
 
 """
-    COLNEW(; bvpclass = 2, collocationpts = 7, autodiff = :central)
-    COLNEW(bvpclass::Int, collocationpts::Int, autodiff)
+    COLNEW(; bvpclass = 1, collocationpts = 7, diagnostic_output = 1,
+        max_num_subintervals = 3000)
+    COLNEW(bvpclass::Int, collocationpts::Int, diagnostic_output::Int,
+        max_num_subintervals::Int)
 
 ## Keyword Arguments:
 
-    - `bvpclass`: Boundary value problem classification, default as nonlinear and "extra sensitive", available choices:
-        - `0`: Linear boundary value problem.
-        - `1`: Nonlinear and regular.
-        - `2`: Nonlinear and "extra sensitive" (first relax factor is rstart and the
-            nonlinear iteration does not rely on past convergence).
-        - `3`: fail-early: return immediately upon:
-            (a) two successive non-convergences.
-            (b) after obtaining an error estimate for the first time.
-    - `collocationpts`: Number of collocation points per subinterval. Require orders[i] ≤ k ≤ 7, default as 7
-    - `diagnostic_output`: Diagnostic output for COLNEW, default as no printout, available choices:
-        - `-1`: Full diagnostic printout.
-        - `0`: Selected printout.
-        - `1`: No printout.
-    - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
+  - `bvpclass`: Boundary value problem classification, default as nonlinear and
+    "extra sensitive", available choices:
+
+      + `0`: Linear boundary value problem.
+      + `1`: Nonlinear and regular.
+      + `2`: Nonlinear and "extra sensitive" (first relax factor is rstart and the
+        nonlinear iteration does not rely on past convergence).
+      + `3`: fail-early: return immediately upon: a) two successive non-convergences.
+        b) after obtaining an error estimate for the first time.
+
+  - `collocationpts`: Number of collocation points per subinterval. Require
+    orders[i] ≤ k ≤ 7, default as 7
+  - `diagnostic_output`: Diagnostic output for COLNEW, default as no printout, available
+    choices:
 
-A Fortran77 code solves a multi-points boundary value problems for a mixed order system of ODEs.
-It incorporates a new basis representation replacing b-splines, and improvements for
+      + `-1`: Full diagnostic printout.
+      + `0`: Selected printout.
+      + `1`: No printout.
+  - `max_num_subintervals`: Number of maximal subintervals, default as 3000.
+
+A Fortran77 code solves a multi-points boundary value problems for a mixed order system of
+ODEs. It incorporates a new basis representation replacing b-splines, and improvements for
 the linear and nonlinear algebraic equation solvers.
 
 !!! warning
+
     Only supports two-point boundary value problems.
 
 !!! note
+
     Only available if the `ODEInterface` package is loaded.
 """
-Base.@kwdef struct COLNEW <: BoundaryValueDiffEqAlgorithm
-    bvpclass::Int = 1
-    collocationpts::Int = 7
-    diagnostic_output::Int = 1
-    max_num_subintervals::Int = 3000
+struct COLNEW <: BoundaryValueDiffEqAlgorithm
+    bvpclass::Int
+    collocationpts::Int
+    diagnostic_output::Int
+    max_num_subintervals::Int
+
+    function COLNEW(; bvpclass::Int = 1, collocationpts::Int = 7,
+            diagnostic_output::Int = 1, max_num_subintervals::Int = 3000)
+        return COLNEW(bvpclass, collocationpts, diagnostic_output, max_num_subintervals)
+    end
+    function COLNEW(bvpclass::Int, collocationpts::Int,
+            diagnostic_output::Int, max_num_subintervals::Int)
+        if Base.get_extension(@__MODULE__, :BoundaryValueDiffEqODEInterfaceExt) === nothing
+            error("`COLNEW` requires `ODEInterface.jl` to be loaded")
+        end
+        return new(bvpclass, collocationpts, diagnostic_output, max_num_subintervals)
+    end
 end
diff --git a/src/collocation.jl b/src/collocation.jl
index 879a9b38..2ac85538 100644
--- a/src/collocation.jl
+++ b/src/collocation.jl
@@ -1,11 +1,11 @@
 function Φ!(residual, cache::MIRKCache, y, u, p = cache.p)
-    return Φ!(residual, cache.fᵢ_cache, cache.k_discrete, cache.f, cache.TU,
-        y, u, p, cache.mesh, cache.mesh_dt, cache.stage)
+    return Φ!(residual, cache.fᵢ_cache, cache.k_discrete, cache.f,
+        cache.TU, y, u, p, cache.mesh, cache.mesh_dt, cache.stage)
 end
 
-@views function Φ!(residual, fᵢ_cache, k_discrete, f!, TU::MIRKTableau, y, u, p,
-        mesh, mesh_dt, stage::Int)
-    @unpack c, v, x, b = TU
+@views function Φ!(residual, fᵢ_cache, k_discrete, f!, TU::MIRKTableau,
+        y, u, p, mesh, mesh_dt, stage::Int)
+    (; c, v, x, b) = TU
 
     tmp = get_tmp(fᵢ_cache, u)
     T = eltype(u)
@@ -30,13 +30,13 @@ end
 end
 
 function Φ(cache::MIRKCache, y, u, p = cache.p)
-    return Φ(cache.fᵢ_cache, cache.k_discrete, cache.f, cache.TU, y, u, p, cache.mesh,
-        cache.mesh_dt, cache.stage)
+    return Φ(cache.fᵢ_cache, cache.k_discrete, cache.f, cache.TU,
+        y, u, p, cache.mesh, cache.mesh_dt, cache.stage)
 end
 
-@views function Φ(fᵢ_cache, k_discrete, f, TU::MIRKTableau, y, u, p, mesh, mesh_dt,
-        stage::Int)
-    @unpack c, v, x, b = TU
+@views function Φ(
+        fᵢ_cache, k_discrete, f, TU::MIRKTableau, y, u, p, mesh, mesh_dt, stage::Int)
+    (; c, v, x, b) = TU
     residuals = [similar(yᵢ) for yᵢ in y[1:(end - 1)]]
     tmp = get_tmp(fᵢ_cache, u)
     T = eltype(u)
diff --git a/src/default_nlsolve.jl b/src/default_nlsolve.jl
new file mode 100644
index 00000000..baa2b8de
--- /dev/null
+++ b/src/default_nlsolve.jl
@@ -0,0 +1,49 @@
+# Currently there are some problems with the default NonlinearSolver selection for
+# BoundaryValueDiffEq
+# See https://github.com/SciML/BoundaryValueDiffEq.jl/issues/175
+# and https://github.com/SciML/BoundaryValueDiffEq.jl/issues/163
+# These are not meant to be user facing and we should delete these once those issues are
+# resolved
+function __FastShortcutBVPCompatibleNLLSPolyalg(
+        ::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing,
+        precs = NonlinearSolve.DEFAULT_PRECS, autodiff = nothing, kwargs...) where {T}
+    if NonlinearSolve.__is_complex(T)
+        algs = (GaussNewton(; concrete_jac, linsolve, precs, autodiff, kwargs...),
+            LevenbergMarquardt(;
+                linsolve, precs, autodiff, disable_geodesic = Val(true), kwargs...),
+            LevenbergMarquardt(; linsolve, precs, autodiff, kwargs...))
+    else
+        algs = (GaussNewton(; concrete_jac, linsolve, precs, autodiff, kwargs...),
+            LevenbergMarquardt(;
+                linsolve, precs, disable_geodesic = Val(true), autodiff, kwargs...),
+            TrustRegion(; concrete_jac, linsolve, precs, autodiff, kwargs...),
+            GaussNewton(; concrete_jac, linsolve, precs,
+                linesearch = LineSearchesJL(; method = BackTracking()),
+                autodiff, kwargs...),
+            LevenbergMarquardt(; linsolve, precs, autodiff, kwargs...))
+    end
+    return NonlinearSolvePolyAlgorithm(algs, Val(:NLLS))
+end
+
+function __FastShortcutBVPCompatibleNonlinearPolyalg(
+        ::Type{T} = Float64; concrete_jac = nothing, linsolve = nothing,
+        precs = NonlinearSolve.DEFAULT_PRECS, autodiff = nothing) where {T}
+    if NonlinearSolve.__is_complex(T)
+        algs = (NewtonRaphson(; concrete_jac, linsolve, precs, autodiff),)
+    else
+        algs = (NewtonRaphson(; concrete_jac, linsolve, precs, autodiff),
+            NewtonRaphson(; concrete_jac, linsolve, precs,
+                linesearch = LineSearchesJL(; method = BackTracking()), autodiff),
+            TrustRegion(; concrete_jac, linsolve, precs, autodiff))
+    end
+    return NonlinearSolvePolyAlgorithm(algs, Val(:NLS))
+end
+
+@inline __concrete_nonlinearsolve_algorithm(prob, alg) = alg
+@inline function __concrete_nonlinearsolve_algorithm(prob, ::Nothing)
+    if prob isa NonlinearLeastSquaresProblem
+        return __FastShortcutBVPCompatibleNLLSPolyalg(eltype(prob.u0))
+    else
+        return __FastShortcutBVPCompatibleNonlinearPolyalg(eltype(prob.u0))
+    end
+end
diff --git a/src/interpolation.jl b/src/interpolation.jl
index 49e284db..d94b13f5 100644
--- a/src/interpolation.jl
+++ b/src/interpolation.jl
@@ -1,6 +1,6 @@
-struct MIRKInterpolation{T1, T2} <: AbstractDiffEqInterpolation
-    t::T1
-    u::T2
+@concrete struct MIRKInterpolation <: AbstractDiffEqInterpolation
+    t
+    u
     cache
 end
 
@@ -9,18 +9,19 @@ function DiffEqBase.interp_summary(interp::MIRKInterpolation)
 end
 
 function (id::MIRKInterpolation)(tvals, idxs, deriv, p, continuity::Symbol = :left)
-    interpolation(tvals, id, idxs, deriv, p, continuity)
+    return interpolation(tvals, id, idxs, deriv, p, continuity)
 end
 
 function (id::MIRKInterpolation)(val, tvals, idxs, deriv, p, continuity::Symbol = :left)
     interpolation!(val, tvals, id, idxs, deriv, p, continuity)
+    return
 end
 
 # FIXME: Fix the interpolation outside the tspan
 
-@inline function interpolation(tvals, id::I, idxs, deriv::D, p,
-        continuity::Symbol = :left) where {I, D}
-    @unpack t, u, cache = id
+@inline function interpolation(
+        tvals, id::I, idxs, deriv::D, p, continuity::Symbol = :left) where {I, D}
+    (; t, u, cache) = id
     tdir = sign(t[end] - t[1])
     idx = sortperm(tvals, rev = tdir < 0)
 
@@ -40,9 +41,9 @@ end
     return DiffEqArray(vals, tvals)
 end
 
-@inline function interpolation!(vals, tvals, id::I, idxs, deriv::D, p,
-        continuity::Symbol = :left) where {I, D}
-    @unpack t, cache = id
+@inline function interpolation!(
+        vals, tvals, id::I, idxs, deriv::D, p, continuity::Symbol = :left) where {I, D}
+    (; t, cache) = id
     tdir = sign(t[end] - t[1])
     idx = sortperm(tvals, rev = tdir < 0)
 
@@ -53,8 +54,8 @@ end
     end
 end
 
-@inline function interpolation(tval::Number, id::I, idxs, deriv::D, p,
-        continuity::Symbol = :left) where {I, D}
+@inline function interpolation(
+        tval::Number, id::I, idxs, deriv::D, p, continuity::Symbol = :left) where {I, D}
     z = similar(id.cache.fᵢ₂_cache)
     interp_eval!(z, id.cache, tval, id.cache.mesh, id.cache.mesh_dt)
     return z
diff --git a/src/solve/mirk.jl b/src/solve/mirk.jl
index 37721222..8d82633e 100644
--- a/src/solve/mirk.jl
+++ b/src/solve/mirk.jl
@@ -36,31 +36,31 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
     @set! alg.jac_alg = concrete_jacobian_algorithm(alg.jac_alg, prob, alg)
     iip = isinplace(prob)
 
-    _, T, M, n, X = __extract_problem_details(prob; dt, check_positive_dt = true)
-    # NOTE: Assumes the user provided initial guess is on a uniform mesh
-    mesh = collect(range(prob.tspan[1], stop = prob.tspan[2], length = n + 1))
+    t₀, t₁ = prob.tspan
+    ig, T, N, Nig, X = __extract_problem_details(prob; dt, check_positive_dt = true)
+    mesh = __extract_mesh(prob.u0, t₀, t₁, Nig)
     mesh_dt = diff(mesh)
 
-    chunksize = pickchunksize(M * (n + 1))
-
-    __alloc = x -> __maybe_allocate_diffcache(vec(x), chunksize, alg.jac_alg)
+    chunksize = pickchunksize(N * (Nig - 1))
+    __alloc = @closure x -> __maybe_allocate_diffcache(vec(x), chunksize, alg.jac_alg)
 
     fᵢ_cache = __alloc(similar(X))
     fᵢ₂_cache = vec(similar(X))
 
-    defect_threshold = T(0.1)  # TODO: Allow user to specify these
-    MxNsub = 3000              # TODO: Allow user to specify these
+    defect_threshold = T(alg.defect_threshold)
+    MxNsub = alg.max_num_subintervals
 
     # Don't flatten this here, since we need to expand it later if needed
-    y₀ = __initial_state_from_prob(prob, mesh)
+    y₀ = __initial_guess_on_mesh(prob.u0, mesh, prob.p, false)
+
     y = __alloc.(copy.(y₀))
     TU, ITU = constructMIRK(alg, T)
     stage = alg_stage(alg)
 
-    k_discrete = [__maybe_allocate_diffcache(similar(X, M, stage), chunksize, alg.jac_alg)
-                  for _ in 1:n]
-    k_interp = [similar(X, ifelse(adaptive, M, 0), ifelse(adaptive, ITU.s_star - stage, 0))
-                for _ in 1:n]
+    k_discrete = [__maybe_allocate_diffcache(similar(X, N, stage), chunksize, alg.jac_alg)
+                  for _ in 1:Nig]
+    k_interp = [similar(X, ifelse(adaptive, N, 0), ifelse(adaptive, ITU.s_star - stage, 0))
+                for _ in 1:Nig]
 
     bcresid_prototype, resid₁_size = __get_bcresid_prototype(prob.problem_type, prob, X)
 
@@ -74,38 +74,42 @@ function SciMLBase.__init(prob::BVProblem, alg::AbstractMIRK; dt = 0.0,
         nothing
     end
 
-    defect = [similar(X, ifelse(adaptive, M, 0)) for _ in 1:n]
-    new_stages = [similar(X, ifelse(adaptive, M, 0)) for _ in 1:n]
+    defect = [similar(X, ifelse(adaptive, N, 0)) for _ in 1:Nig]
+    new_stages = [similar(X, ifelse(adaptive, N, 0)) for _ in 1:Nig]
 
     # Transform the functions to handle non-vector inputs
     bcresid_prototype = __vec(bcresid_prototype)
     f, bc = if X isa AbstractVector
         prob.f, prob.f.bc
     elseif iip
-        vecf! = (du, u, p, t) -> __vec_f!(du, u, p, t, prob.f, size(X))
+        vecf! = @closure (du, u, p, t) -> __vec_f!(du, u, p, t, prob.f, size(X))
         vecbc! = if !(prob.problem_type isa TwoPointBVProblem)
-            (r, u, p, t) -> __vec_bc!(r, u, p, t, prob.f.bc, resid₁_size, size(X))
+            @closure (r, u, p, t) -> __vec_bc!(r, u, p, t, prob.f.bc, resid₁_size, size(X))
         else
-            ((r, u, p) -> __vec_bc!(r, u, p, prob.f.bc[1], resid₁_size[1], size(X)),
-                (r, u, p) -> __vec_bc!(r, u, p, prob.f.bc[2], resid₁_size[2], size(X)))
+            (
+                @closure((r, u, p)->__vec_bc!(
+                    r, u, p, first(prob.f.bc), resid₁_size[1], size(X))),
+                @closure((r, u, p)->__vec_bc!(
+                    r, u, p, last(prob.f.bc), resid₁_size[2], size(X))))
         end
         vecf!, vecbc!
     else
-        vecf = (u, p, t) -> __vec_f(u, p, t, prob.f, size(X))
+        vecf = @closure (u, p, t) -> __vec_f(u, p, t, prob.f, size(X))
         vecbc = if !(prob.problem_type isa TwoPointBVProblem)
-            (u, p, t) -> __vec_bc(u, p, t, prob.f.bc, size(X))
+            @closure (u, p, t) -> __vec_bc(u, p, t, prob.f.bc, size(X))
         else
-            ((u, p) -> __vec_bc(u, p, prob.f.bc[1], size(X))),
-            (u, p) -> __vec_bc(u, p, prob.f.bc[2], size(X))
+            (@closure((u, p)->__vec_bc(u, p, first(prob.f.bc), size(X))),
+                @closure((u, p)->__vec_bc(u, p, last(prob.f.bc), size(X))))
         end
         vecf, vecbc
     end
 
     prob_ = !(prob.u0 isa AbstractArray) ? remake(prob; u0 = X) : prob
 
-    return MIRKCache{iip, T}(alg_order(alg), stage, M, size(X), f, bc, prob_,
-        prob.problem_type, prob.p, alg, TU, ITU, bcresid_prototype, mesh, mesh_dt,
-        k_discrete, k_interp, y, y₀, residual, fᵢ_cache, fᵢ₂_cache, defect, new_stages,
+    return MIRKCache{iip, T}(
+        alg_order(alg), stage, N, size(X), f, bc, prob_, prob.problem_type,
+        prob.p, alg, TU, ITU, bcresid_prototype, mesh, mesh_dt, k_discrete,
+        k_interp, y, y₀, residual, fᵢ_cache, fᵢ₂_cache, defect, new_stages,
         resid₁_size, (; defect_threshold, MxNsub, abstol, dt, adaptive, kwargs...))
 end
 
@@ -127,64 +131,82 @@ function __expand_cache!(cache::MIRKCache)
     return cache
 end
 
-function __split_mirk_kwargs(; defect_threshold, MxNsub, abstol, dt, adaptive = true,
-        kwargs...)
-    return ((defect_threshold, MxNsub, abstol, adaptive, dt),
-        (; abstol, adaptive, kwargs...))
+function __split_mirk_kwargs(;
+        defect_threshold, MxNsub, abstol, dt, adaptive = true, kwargs...)
+    return (
+        (defect_threshold, MxNsub, abstol, adaptive, dt), (; abstol, adaptive, kwargs...))
 end
 
 function SciMLBase.solve!(cache::MIRKCache)
-    (defect_threshold, MxNsub, abstol, adaptive, _), kwargs = __split_mirk_kwargs(;
-        cache.kwargs...)
-    @unpack y, y₀, prob, alg, mesh, mesh_dt, TU, ITU = cache
+    (_, _, abstol, adaptive, _), kwargs = __split_mirk_kwargs(; cache.kwargs...)
     info::ReturnCode.T = ReturnCode.Success
-    defect_norm = 2 * abstol
 
-    while SciMLBase.successful_retcode(info) && defect_norm > abstol
-        nlprob = __construct_nlproblem(cache, recursive_flatten(y₀))
-        sol_nlprob = __solve(nlprob, alg.nlsolve; abstol, kwargs...)
-        recursive_unflatten!(cache.y₀, sol_nlprob.u)
+    # We do the first iteration outside the loop to preserve type-stability of the
+    # `original` field of the solution
+    sol_nlprob, info, defect_norm = __perform_mirk_iteration(
+        cache, abstol, adaptive; kwargs...)
 
-        info = sol_nlprob.retcode
+    if adaptive
+        while SciMLBase.successful_retcode(info) && defect_norm > abstol
+            sol_nlprob, info, defect_norm = __perform_mirk_iteration(
+                cache, abstol, adaptive; kwargs...)
+        end
+    end
 
-        !adaptive && break
+    u = [reshape(y, cache.in_size) for y in cache.y₀]
 
-        if info == ReturnCode.Success
-            defect_norm = defect_estimate!(cache)
-            # The defect is greater than 10%, the solution is not acceptable
-            defect_norm > defect_threshold && (info = ReturnCode.Failure)
-        end
+    odesol = DiffEqBase.build_solution(cache.prob, cache.alg, cache.mesh, u;
+        interp = MIRKInterpolation(cache.mesh, u, cache), retcode = info)
+    return __build_solution(cache.prob, odesol, sol_nlprob)
+end
+
+function __perform_mirk_iteration(
+        cache::MIRKCache, abstol, adaptive; nlsolve_kwargs = (;), kwargs...)
+    nlprob = __construct_nlproblem(cache, recursive_flatten(cache.y₀))
+    nlsolve_alg = __concrete_nonlinearsolve_algorithm(nlprob, cache.alg.nlsolve)
+    sol_nlprob = __solve(
+        nlprob, nlsolve_alg; abstol, kwargs..., nlsolve_kwargs..., alias_u0 = true)
+    recursive_unflatten!(cache.y₀, sol_nlprob.u)
+
+    defect_norm = 2 * abstol
+
+    # Early terminate if non-adaptive
+    adaptive || return sol_nlprob, sol_nlprob.retcode, defect_norm
+
+    info = sol_nlprob.retcode
+
+    if info == ReturnCode.Success # Nonlinear Solve was successful
+        defect_norm = defect_estimate!(cache)
+        # The defect is greater than 10%, the solution is not acceptable
+        defect_norm > cache.alg.defect_threshold && (info = ReturnCode.Failure)
+    end
 
-        if info == ReturnCode.Success
-            if defect_norm > abstol
-                # We construct a new mesh to equidistribute the defect
-                mesh, mesh_dt, _, info = mesh_selector!(cache)
-                if info == ReturnCode.Success
-                    __append_similar!(cache.y₀, length(cache.mesh), cache.M)
-                    for (i, m) in enumerate(cache.mesh)
-                        interp_eval!(cache.y₀[i], cache, m, mesh, mesh_dt)
-                    end
-                    __expand_cache!(cache)
+    if info == ReturnCode.Success # Nonlinear Solve Successful and defect norm is acceptable
+        if defect_norm > abstol
+            # We construct a new mesh to equidistribute the defect
+            mesh, mesh_dt, _, info = mesh_selector!(cache)
+            if info == ReturnCode.Success
+                __append_similar!(cache.y₀, length(cache.mesh), cache.M)
+                for (i, m) in enumerate(cache.mesh)
+                    interp_eval!(cache.y₀[i], cache, m, mesh, mesh_dt)
                 end
-            end
-        else
-            #  We cannot obtain a solution for the current mesh
-            if 2 * (length(cache.mesh) - 1) > MxNsub
-                # New mesh would be too large
-                info = ReturnCode.Failure
-            else
-                half_mesh!(cache)
                 __expand_cache!(cache)
-                recursive_fill!(cache.y₀, 0)
-                info = ReturnCode.Success # Force a restart
-                defect_norm = 2 * abstol
             end
         end
+    else # Something bad happened
+        # We cannot obtain a solution for the current mesh
+        if 2 * (length(cache.mesh) - 1) > cache.alg.max_num_subintervals
+            # New mesh would be too large
+            info = ReturnCode.Failure
+        else
+            half_mesh!(cache)
+            __expand_cache!(cache)
+            recursive_fill!(cache.y₀, 0)
+            info = ReturnCode.Success # Force a restart
+        end
     end
 
-    u = [reshape(y, cache.in_size) for y in cache.y₀]
-    return DiffEqBase.build_solution(prob, alg, cache.mesh,
-        u; interp = MIRKInterpolation(cache.mesh, u, cache), retcode = info)
+    return sol_nlprob, info, defect_norm
 end
 
 # Constructing the Nonlinear Problem
@@ -192,30 +214,31 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y::AbstractVector) where {
     pt = cache.problem_type
 
     loss_bc = if iip
-        (du, u, p) -> __mirk_loss_bc!(du, u, p, pt, cache.bc, cache.y, cache.mesh)
+        @closure (du, u, p) -> __mirk_loss_bc!(du, u, p, pt, cache.bc, cache.y, cache.mesh)
     else
-        (u, p) -> __mirk_loss_bc(u, p, pt, cache.bc, cache.y, cache.mesh)
+        @closure (u, p) -> __mirk_loss_bc(u, p, pt, cache.bc, cache.y, cache.mesh)
     end
 
     loss_collocation = if iip
-        (du, u, p) -> __mirk_loss_collocation!(du, u, p, cache.y, cache.mesh,
-            cache.residual, cache)
+        @closure (du, u, p) -> __mirk_loss_collocation!(
+            du, u, p, cache.y, cache.mesh, cache.residual, cache)
     else
-        (u, p) -> __mirk_loss_collocation(u, p, cache.y, cache.mesh, cache.residual, cache)
+        @closure (u, p) -> __mirk_loss_collocation(
+            u, p, cache.y, cache.mesh, cache.residual, cache)
     end
 
     loss = if iip
-        (du, u, p) -> __mirk_loss!(du, u, p, cache.y, pt, cache.bc, cache.residual,
-            cache.mesh, cache)
+        @closure (du, u, p) -> __mirk_loss!(
+            du, u, p, cache.y, pt, cache.bc, cache.residual, cache.mesh, cache)
     else
-        (u, p) -> __mirk_loss(u, p, cache.y, pt, cache.bc, cache.mesh, cache)
+        @closure (u, p) -> __mirk_loss(u, p, cache.y, pt, cache.bc, cache.mesh, cache)
     end
 
     return __construct_nlproblem(cache, y, loss_bc, loss_collocation, loss, pt)
 end
 
-function __mirk_loss!(resid, u, p, y, pt::StandardBVProblem, bc!::BC, residual, mesh,
-        cache) where {BC}
+@views function __mirk_loss!(
+        resid, u, p, y, pt::StandardBVProblem, bc!::BC, residual, mesh, cache) where {BC}
     y_ = recursive_unflatten!(y, u)
     resids = [get_tmp(r, u) for r in residual]
     eval_bc_residual!(resids[1], pt, bc!, y_, p, mesh)
@@ -224,46 +247,45 @@ function __mirk_loss!(resid, u, p, y, pt::StandardBVProblem, bc!::BC, residual,
     return nothing
 end
 
-function __mirk_loss!(
-        resid, u, p, y, pt::TwoPointBVProblem, bc!::Tuple{BC1, BC2}, residual,
-        mesh, cache) where {BC1, BC2}
+@views function __mirk_loss!(resid, u, p, y, pt::TwoPointBVProblem, bc!::Tuple{BC1, BC2},
+        residual, mesh, cache) where {BC1, BC2}
     y_ = recursive_unflatten!(y, u)
     resids = [get_tmp(r, u) for r in residual]
-    resida = @view resids[1][1:prod(cache.resid_size[1])]
-    residb = @view resids[1][(prod(cache.resid_size[1]) + 1):end]
+    resida = resids[1][1:prod(cache.resid_size[1])]
+    residb = resids[1][(prod(cache.resid_size[1]) + 1):end]
     eval_bc_residual!((resida, residb), pt, bc!, y_, p, mesh)
     Φ!(resids[2:end], cache, y_, u, p)
     recursive_flatten_twopoint!(resid, resids, cache.resid_size)
     return nothing
 end
 
-function __mirk_loss(u, p, y, pt::StandardBVProblem, bc::BC, mesh, cache) where {BC}
+@views function __mirk_loss(u, p, y, pt::StandardBVProblem, bc::BC, mesh, cache) where {BC}
     y_ = recursive_unflatten!(y, u)
     resid_bc = eval_bc_residual(pt, bc, y_, p, mesh)
     resid_co = Φ(cache, y_, u, p)
     return vcat(resid_bc, mapreduce(vec, vcat, resid_co))
 end
 
-function __mirk_loss(u, p, y, pt::TwoPointBVProblem, bc::Tuple{BC1, BC2}, mesh,
-        cache) where {BC1, BC2}
+@views function __mirk_loss(
+        u, p, y, pt::TwoPointBVProblem, bc::Tuple{BC1, BC2}, mesh, cache) where {BC1, BC2}
     y_ = recursive_unflatten!(y, u)
     resid_bca, resid_bcb = eval_bc_residual(pt, bc, y_, p, mesh)
     resid_co = Φ(cache, y_, u, p)
     return vcat(resid_bca, mapreduce(vec, vcat, resid_co), resid_bcb)
 end
 
-function __mirk_loss_bc!(resid, u, p, pt, bc!::BC, y, mesh) where {BC}
+@views function __mirk_loss_bc!(resid, u, p, pt, bc!::BC, y, mesh) where {BC}
     y_ = recursive_unflatten!(y, u)
     eval_bc_residual!(resid, pt, bc!, y_, p, mesh)
     return nothing
 end
 
-function __mirk_loss_bc(u, p, pt, bc!::BC, y, mesh) where {BC}
+@views function __mirk_loss_bc(u, p, pt, bc!::BC, y, mesh) where {BC}
     y_ = recursive_unflatten!(y, u)
     return eval_bc_residual(pt, bc!, y_, p, mesh)
 end
 
-function __mirk_loss_collocation!(resid, u, p, y, mesh, residual, cache)
+@views function __mirk_loss_collocation!(resid, u, p, y, mesh, residual, cache)
     y_ = recursive_unflatten!(y, u)
     resids = [get_tmp(r, u) for r in residual[2:end]]
     Φ!(resids, cache, y_, u, p)
@@ -271,7 +293,7 @@ function __mirk_loss_collocation!(resid, u, p, y, mesh, residual, cache)
     return nothing
 end
 
-function __mirk_loss_collocation(u, p, y, mesh, residual, cache)
+@views function __mirk_loss_collocation(u, p, y, mesh, residual, cache)
     y_ = recursive_unflatten!(y, u)
     resids = Φ(cache, y_, u, p)
     return mapreduce(vec, vcat, resids)
@@ -279,42 +301,43 @@ end
 
 function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc::BC, loss_collocation::C,
         loss::LF, ::StandardBVProblem) where {iip, BC, C, LF}
-    @unpack nlsolve, jac_alg = cache.alg
+    (; nlsolve, jac_alg) = cache.alg
     N = length(cache.mesh)
 
     resid_bc = cache.bcresid_prototype
     L = length(resid_bc)
     resid_collocation = similar(y, cache.M * (N - 1))
 
-    loss_bcₚ = iip ? ((du, u) -> loss_bc(du, u, cache.p)) : (u -> loss_bc(u, cache.p))
-    loss_collocationₚ = iip ? ((du, u) -> loss_collocation(du, u, cache.p)) :
-                        (u -> loss_collocation(u, cache.p))
+    loss_bcₚ = (iip ? __Fix3 : Base.Fix2)(loss_bc, cache.p)
+    loss_collocationₚ = (iip ? __Fix3 : Base.Fix2)(loss_collocation, cache.p)
 
     sd_bc = jac_alg.bc_diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
             NoSparsityDetection()
-    cache_bc = __sparse_jacobian_cache(Val(iip), jac_alg.bc_diffmode, sd_bc, loss_bcₚ,
-        resid_bc, y)
+    cache_bc = __sparse_jacobian_cache(
+        Val(iip), jac_alg.bc_diffmode, sd_bc, loss_bcₚ, resid_bc, y)
 
     sd_collocation = if jac_alg.nonbc_diffmode isa AbstractSparseADType
         if L < cache.M
             # For underdetermined problems we use sparse since we don't have banded qr
-            colored_matrix = __generate_sparse_jacobian_prototype(cache,
-                cache.problem_type, y, y, cache.M, N)
+            colored_matrix = __generate_sparse_jacobian_prototype(
+                cache, cache.problem_type, y, y, cache.M, N)
             J_full_band = nothing
-            __sparsity_detection_alg(ColoredMatrix(sparse(colored_matrix.M),
-                colored_matrix.row_colorvec, colored_matrix.col_colorvec))
+            __sparsity_detection_alg(ColoredMatrix(
+                sparse(colored_matrix.M), colored_matrix.row_colorvec,
+                colored_matrix.col_colorvec))
         else
             J_full_band = BandedMatrix(Ones{eltype(y)}(L + cache.M * (N - 1), cache.M * N),
                 (L + 1, cache.M + max(cache.M - L, 0)))
-            __sparsity_detection_alg(__generate_sparse_jacobian_prototype(cache,
-                cache.problem_type, y, y, cache.M, N))
+            __sparsity_detection_alg(__generate_sparse_jacobian_prototype(
+                cache, cache.problem_type, y, y, cache.M, N))
         end
     else
         J_full_band = nothing
         NoSparsityDetection()
     end
-    cache_collocation = __sparse_jacobian_cache(Val(iip), jac_alg.nonbc_diffmode,
-        sd_collocation, loss_collocationₚ, resid_collocation, y)
+    cache_collocation = __sparse_jacobian_cache(
+        Val(iip), jac_alg.nonbc_diffmode, sd_collocation,
+        loss_collocationₚ, resid_collocation, y)
 
     J_bc = init_jacobian(cache_bc)
     J_c = init_jacobian(cache_collocation)
@@ -325,53 +348,54 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc::BC, loss_collo
     end
 
     jac = if iip
-        (J, u, p) -> __mirk_mpoint_jacobian!(J, J_c, u, jac_alg.bc_diffmode,
-            jac_alg.nonbc_diffmode, cache_bc, cache_collocation, loss_bcₚ,
-            loss_collocationₚ, resid_bc, resid_collocation, L)
+        @closure (J, u, p) -> __mirk_mpoint_jacobian!(
+            J, J_c, u, jac_alg.bc_diffmode, jac_alg.nonbc_diffmode, cache_bc,
+            cache_collocation, loss_bcₚ, loss_collocationₚ, resid_bc, resid_collocation, L)
     else
-        (u, p) -> __mirk_mpoint_jacobian(jac_prototype, J_c, u, jac_alg.bc_diffmode,
-            jac_alg.nonbc_diffmode, cache_bc, cache_collocation, loss_bcₚ,
-            loss_collocationₚ, L)
+        @closure (u, p) -> __mirk_mpoint_jacobian(
+            jac_prototype, J_c, u, jac_alg.bc_diffmode, jac_alg.nonbc_diffmode,
+            cache_bc, cache_collocation, loss_bcₚ, loss_collocationₚ, L)
     end
 
-    nlf = NonlinearFunction{iip}(loss; resid_prototype = vcat(resid_bc, resid_collocation),
-        jac, jac_prototype)
-    return (L == cache.M ? NonlinearProblem : NonlinearLeastSquaresProblem)(nlf, y, cache.p)
+    resid_prototype = vcat(resid_bc, resid_collocation)
+    nlf = __unsafe_nonlinearfunction{iip}(loss; resid_prototype, jac, jac_prototype)
+
+    return __internal_nlsolve_problem(cache.prob, resid_prototype, y, nlf, y, cache.p)
 end
 
-function __mirk_mpoint_jacobian!(J, _, x, bc_diffmode, nonbc_diffmode, bc_diffcache,
-        nonbc_diffcache, loss_bc::BC, loss_collocation::C, resid_bc, resid_collocation,
-        L::Int) where {BC, C}
+function __mirk_mpoint_jacobian!(
+        J, _, x, bc_diffmode, nonbc_diffmode, bc_diffcache, nonbc_diffcache, loss_bc::BC,
+        loss_collocation::C, resid_bc, resid_collocation, L::Int) where {BC, C}
     sparse_jacobian!(@view(J[1:L, :]), bc_diffmode, bc_diffcache, loss_bc, resid_bc, x)
-    sparse_jacobian!(@view(J[(L + 1):end, :]), nonbc_diffmode, nonbc_diffcache,
-        loss_collocation, resid_collocation, x)
+    sparse_jacobian!(@view(J[(L + 1):end, :]), nonbc_diffmode,
+        nonbc_diffcache, loss_collocation, resid_collocation, x)
     return nothing
 end
 
-function __mirk_mpoint_jacobian!(
-        J::AlmostBandedMatrix, J_c, x, bc_diffmode, nonbc_diffmode,
-        bc_diffcache, nonbc_diffcache, loss_bc::BC, loss_collocation::C, resid_bc,
-        resid_collocation, L::Int) where {BC, C}
+function __mirk_mpoint_jacobian!(J::AlmostBandedMatrix, J_c, x, bc_diffmode, nonbc_diffmode,
+        bc_diffcache, nonbc_diffcache, loss_bc::BC, loss_collocation::C,
+        resid_bc, resid_collocation, L::Int) where {BC, C}
     J_bc = fillpart(J)
     sparse_jacobian!(J_bc, bc_diffmode, bc_diffcache, loss_bc, resid_bc, x)
-    sparse_jacobian!(J_c, nonbc_diffmode, nonbc_diffcache,
-        loss_collocation, resid_collocation, x)
+    sparse_jacobian!(
+        J_c, nonbc_diffmode, nonbc_diffcache, loss_collocation, resid_collocation, x)
     exclusive_bandpart(J) .= J_c
     finish_part_setindex!(J)
     return nothing
 end
 
-function __mirk_mpoint_jacobian(J, _, x, bc_diffmode, nonbc_diffmode, bc_diffcache,
-        nonbc_diffcache, loss_bc::BC, loss_collocation::C, L::Int) where {BC, C}
+function __mirk_mpoint_jacobian(
+        J, _, x, bc_diffmode, nonbc_diffmode, bc_diffcache, nonbc_diffcache,
+        loss_bc::BC, loss_collocation::C, L::Int) where {BC, C}
     sparse_jacobian!(@view(J[1:L, :]), bc_diffmode, bc_diffcache, loss_bc, x)
-    sparse_jacobian!(@view(J[(L + 1):end, :]), nonbc_diffmode, nonbc_diffcache,
-        loss_collocation, x)
+    sparse_jacobian!(
+        @view(J[(L + 1):end, :]), nonbc_diffmode, nonbc_diffcache, loss_collocation, x)
     return J
 end
 
-function __mirk_mpoint_jacobian(J::AlmostBandedMatrix, J_c, x, bc_diffmode, nonbc_diffmode,
-        bc_diffcache, nonbc_diffcache, loss_bc::BC, loss_collocation::C,
-        L::Int) where {BC, C}
+function __mirk_mpoint_jacobian(
+        J::AlmostBandedMatrix, J_c, x, bc_diffmode, nonbc_diffmode, bc_diffcache,
+        nonbc_diffcache, loss_bc::BC, loss_collocation::C, L::Int) where {BC, C}
     J_bc = fillpart(J)
     sparse_jacobian!(J_bc, bc_diffmode, bc_diffcache, loss_bc, x)
     sparse_jacobian!(J_c, nonbc_diffmode, nonbc_diffcache, loss_collocation, x)
@@ -382,7 +406,7 @@ end
 
 function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc::BC, loss_collocation::C,
         loss::LF, ::TwoPointBVProblem) where {iip, BC, C, LF}
-    @unpack nlsolve, jac_alg = cache.alg
+    (; nlsolve, jac_alg) = cache.alg
     N = length(cache.mesh)
 
     lossₚ = iip ? ((du, u) -> loss(du, u, cache.p)) : (u -> loss(u, cache.p))
@@ -393,10 +417,11 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc::BC, loss_collo
     L = length(cache.bcresid_prototype)
 
     sd = if jac_alg.diffmode isa AbstractSparseADType
-        __sparsity_detection_alg(__generate_sparse_jacobian_prototype(cache,
-            cache.problem_type, @view(cache.bcresid_prototype[1:prod(cache.resid_size[1])]),
-            @view(cache.bcresid_prototype[(prod(cache.resid_size[1]) + 1):end]), cache.M,
-            N))
+        __sparsity_detection_alg(__generate_sparse_jacobian_prototype(
+            cache, cache.problem_type,
+            @view(cache.bcresid_prototype[1:prod(cache.resid_size[1])]),
+            @view(cache.bcresid_prototype[(prod(cache.resid_size[1]) + 1):end]),
+            cache.M, N))
     else
         NoSparsityDetection()
     end
@@ -404,16 +429,16 @@ function __construct_nlproblem(cache::MIRKCache{iip}, y, loss_bc::BC, loss_collo
     jac_prototype = init_jacobian(diffcache)
 
     jac = if iip
-        (J, u, p) -> __mirk_2point_jacobian!(J, u, jac_alg.diffmode, diffcache, lossₚ,
-            resid)
+        @closure (J, u, p) -> __mirk_2point_jacobian!(
+            J, u, jac_alg.diffmode, diffcache, lossₚ, resid)
     else
-        (u, p) -> __mirk_2point_jacobian(u, jac_prototype, jac_alg.diffmode, diffcache,
-            lossₚ)
+        @closure (u, p) -> __mirk_2point_jacobian(
+            u, jac_prototype, jac_alg.diffmode, diffcache, lossₚ)
     end
 
-    nlf = NonlinearFunction{iip}(loss; resid_prototype = copy(resid), jac, jac_prototype)
-
-    return (L == cache.M ? NonlinearProblem : NonlinearLeastSquaresProblem)(nlf, y, cache.p)
+    resid_prototype = copy(resid)
+    nlf = __unsafe_nonlinearfunction{iip}(loss; resid_prototype, jac, jac_prototype)
+    return __internal_nlsolve_problem(cache.prob, resid_prototype, y, nlf, y, cache.p)
 end
 
 function __mirk_2point_jacobian!(J, x, diffmode, diffcache, loss_fn::L, resid) where {L}
diff --git a/src/solve/multiple_shooting.jl b/src/solve/multiple_shooting.jl
index 565e56fe..dbf76dfd 100644
--- a/src/solve/multiple_shooting.jl
+++ b/src/solve/multiple_shooting.jl
@@ -1,12 +1,17 @@
 function __solve(prob::BVProblem, _alg::MultipleShooting; odesolve_kwargs = (;),
         nlsolve_kwargs = (;), ensemblealg = EnsembleThreads(), verbose = true, kwargs...)
-    @unpack f, tspan = prob
+    (; f, tspan) = prob
 
-    @assert (ensemblealg isa EnsembleSerial)||(ensemblealg isa EnsembleThreads) "Currently MultipleShooting only supports `EnsembleSerial` and `EnsembleThreads`!"
+    if !(ensemblealg isa EnsembleSerial) && !(ensemblealg isa EnsembleThreads)
+        throw(ArgumentError("Currently MultipleShooting only supports `EnsembleSerial` and \
+                             `EnsembleThreads`!"))
+    end
 
     ig, T, N, Nig, u0 = __extract_problem_details(prob; dt = 0.1)
     has_initial_guess = _unwrap_val(ig)
 
+    @assert u0 isa AbstractVector "Non-Vector Inputs for Multiple-Shooting hasn't been implemented yet!"
+
     bcresid_prototype, resid_size = __get_bcresid_prototype(prob, u0)
     iip, bc, u0, u0_size = isinplace(prob), prob.f.bc, deepcopy(u0), size(u0)
 
@@ -30,41 +35,37 @@ function __solve(prob::BVProblem, _alg::MultipleShooting; odesolve_kwargs = (;),
 
     internal_ode_kwargs = (; verbose, kwargs..., odesolve_kwargs..., save_end = true)
 
-    function solve_internal_odes!(resid_nodes::T1, us::T2, p::T3, cur_nshoot::Int,
-            nodes::T4, odecache::C) where {T1, T2, T3, T4, C}
-        return __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoot,
-            odecache, nodes, u0_size, N, ensemblealg)
-    end
+    solve_internal_odes! = @closure (resid_nodes, us, p, cur_nshoot, nodes, odecache) -> __multiple_shooting_solve_internal_odes!(
+        resid_nodes, us, cur_nshoot, odecache, nodes, u0_size, N, ensemblealg, tspan)
 
     # This gets all the nshoots except the final SingleShooting case
     all_nshoots = __get_all_nshoots(alg.grid_coarsening, nshoots)
     u_at_nodes, nodes = similar(u0, 0), typeof(first(tspan))[]
 
-    ode_cache_loss_fn = __multiple_shooting_init_odecache(ensemblealg, prob,
-        alg.ode_alg, u0, maximum(all_nshoots); internal_ode_kwargs...)
+    ode_cache_loss_fn = __multiple_shooting_init_odecache(
+        ensemblealg, prob, alg.ode_alg, u0, maximum(all_nshoots); internal_ode_kwargs...)
 
     for (i, cur_nshoot) in enumerate(all_nshoots)
         if i == 1
-            u_at_nodes = __multiple_shooting_initialize!(nodes, prob, alg, ig, nshoots,
-                ode_cache_loss_fn; kwargs..., verbose, odesolve_kwargs...)
+            u_at_nodes = __multiple_shooting_initialize!(
+                nodes, prob, alg, ig, nshoots, ode_cache_loss_fn;
+                kwargs..., verbose, odesolve_kwargs...)
         else
-            u_at_nodes = __multiple_shooting_initialize!(nodes, u_at_nodes, prob, alg,
-                cur_nshoot, all_nshoots[i - 1], ig, ode_cache_loss_fn, u0; kwargs...,
-                verbose, odesolve_kwargs...)
+            u_at_nodes = __multiple_shooting_initialize!(
+                nodes, u_at_nodes, prob, alg, cur_nshoot, all_nshoots[i - 1], ig,
+                ode_cache_loss_fn, u0; kwargs..., verbose, odesolve_kwargs...)
         end
 
         if prob.problem_type isa TwoPointBVProblem
-            __solve_nlproblem!(
-                prob.problem_type, alg, bcresid_prototype, u_at_nodes, nodes,
-                cur_nshoot, M, N, resida_len, residb_len, solve_internal_odes!, bc[1],
-                bc[2], prob, u0, ode_cache_loss_fn, ensemblealg, internal_ode_kwargs;
-                verbose, kwargs..., nlsolve_kwargs...)
+            __solve_nlproblem!(prob.problem_type, alg, bcresid_prototype, u_at_nodes, nodes,
+                cur_nshoot, M, N, resida_len, residb_len, solve_internal_odes!,
+                bc[1], bc[2], prob, u0, ode_cache_loss_fn, ensemblealg,
+                internal_ode_kwargs; verbose, kwargs..., nlsolve_kwargs...)
         else
-            __solve_nlproblem!(
-                prob.problem_type, alg, bcresid_prototype, u_at_nodes, nodes,
-                cur_nshoot, M, N, prod(resid_size), solve_internal_odes!, bc, prob, f,
-                u0_size, u0, ode_cache_loss_fn, ensemblealg, internal_ode_kwargs; verbose,
-                kwargs..., nlsolve_kwargs...)
+            __solve_nlproblem!(prob.problem_type, alg, bcresid_prototype, u_at_nodes, nodes,
+                cur_nshoot, M, N, prod(resid_size), solve_internal_odes!,
+                bc, prob, f, u0_size, u0, ode_cache_loss_fn, ensemblealg,
+                internal_ode_kwargs; verbose, kwargs..., nlsolve_kwargs...)
         end
     end
 
@@ -73,12 +74,12 @@ function __solve(prob::BVProblem, _alg::MultipleShooting; odesolve_kwargs = (;),
     else
         diffmode_shooting = __get_non_sparse_ad(alg.jac_alg.bc_diffmode)
     end
-    shooting_alg = Shooting(alg.ode_alg, alg.nlsolve,
-        BVPJacobianAlgorithm(diffmode_shooting))
+    shooting_alg = Shooting(
+        alg.ode_alg, alg.nlsolve, BVPJacobianAlgorithm(diffmode_shooting))
 
     single_shooting_prob = remake(prob; u0 = reshape(u_at_nodes[1:N], u0_size))
-    return __solve(single_shooting_prob, shooting_alg; odesolve_kwargs, nlsolve_kwargs,
-        verbose, kwargs...)
+    return __solve(single_shooting_prob, shooting_alg; odesolve_kwargs,
+        nlsolve_kwargs, verbose, kwargs...)
 end
 
 # TODO: We can save even more memory by hoisting the preallocated caches for the ODEs
@@ -86,48 +87,49 @@ end
 # TODO: But we can do it another day. Currently the gains here are quite high to justify
 # TODO: waiting.
 
-function __solve_nlproblem!(::TwoPointBVProblem, alg::MultipleShooting, bcresid_prototype,
-        u_at_nodes, nodes, cur_nshoot::Int, M::Int, N::Int, resida_len::Int,
-        residb_len::Int, solve_internal_odes!::S, bca::B1, bcb::B2, prob, u0,
-        ode_cache_loss_fn, ensemblealg, internal_ode_kwargs; kwargs...) where {B1, B2, S}
+function __solve_nlproblem!(
+        ::TwoPointBVProblem, alg::MultipleShooting, bcresid_prototype, u_at_nodes,
+        nodes, cur_nshoot::Int, M::Int, N::Int, resida_len::Int, residb_len::Int,
+        solve_internal_odes!::S, bca::B1, bcb::B2, prob, u0, ode_cache_loss_fn,
+        ensemblealg, internal_ode_kwargs; kwargs...) where {B1, B2, S}
     if __any_sparse_ad(alg.jac_alg)
-        J_proto = __generate_sparse_jacobian_prototype(alg, prob.problem_type,
-            bcresid_prototype, u0, N, cur_nshoot)
+        J_proto = __generate_sparse_jacobian_prototype(
+            alg, prob.problem_type, bcresid_prototype, u0, N, cur_nshoot)
     end
 
-    resid_prototype = vcat(bcresid_prototype[1],
-        similar(u_at_nodes, cur_nshoot * N), bcresid_prototype[2])
+    resid_prototype = vcat(
+        bcresid_prototype[1], similar(u_at_nodes, cur_nshoot * N), bcresid_prototype[2])
 
-    loss_fn = (du, u, p) -> __multiple_shooting_2point_loss!(du, u, p, cur_nshoot,
-        nodes, prob, solve_internal_odes!, resida_len, residb_len, N, bca, bcb,
-        ode_cache_loss_fn)
+    loss_fn = @closure (du, u, p) -> __multiple_shooting_2point_loss!(
+        du, u, p, cur_nshoot, nodes, prob, solve_internal_odes!,
+        resida_len, residb_len, N, bca, bcb, ode_cache_loss_fn)
 
     sd_bvp = alg.jac_alg.diffmode isa AbstractSparseADType ?
              __sparsity_detection_alg(J_proto) : NoSparsityDetection()
 
     resid_prototype_cached = similar(resid_prototype)
-    jac_cache = sparse_jacobian_cache(alg.jac_alg.diffmode, sd_bvp, nothing,
-        resid_prototype_cached, u_at_nodes)
+    jac_cache = sparse_jacobian_cache(
+        alg.jac_alg.diffmode, sd_bvp, nothing, resid_prototype_cached, u_at_nodes)
     jac_prototype = init_jacobian(jac_cache)
 
-    ode_cache_jac_fn = __multiple_shooting_init_jacobian_odecache(ensemblealg, prob,
-        jac_cache, alg.jac_alg.diffmode, alg.ode_alg, cur_nshoot, u0;
-        internal_ode_kwargs...)
+    ode_cache_jac_fn = __multiple_shooting_init_jacobian_odecache(
+        ensemblealg, prob, jac_cache, alg.jac_alg.diffmode,
+        alg.ode_alg, cur_nshoot, u0; internal_ode_kwargs...)
 
-    loss_fnₚ = (du, u) -> __multiple_shooting_2point_loss!(du, u, prob.p, cur_nshoot,
-        nodes, prob, solve_internal_odes!, resida_len, residb_len, N, bca, bcb,
-        ode_cache_jac_fn)
+    loss_fnₚ = @closure (du, u) -> __multiple_shooting_2point_loss!(
+        du, u, prob.p, cur_nshoot, nodes, prob, solve_internal_odes!,
+        resida_len, residb_len, N, bca, bcb, ode_cache_jac_fn)
 
-    jac_fn = (J, u, p) -> __multiple_shooting_2point_jacobian!(J, u, p, jac_cache,
-        loss_fnₚ, resid_prototype_cached, alg)
+    jac_fn = @closure (J, u, p) -> __multiple_shooting_2point_jacobian!(
+        J, u, p, jac_cache, loss_fnₚ, resid_prototype_cached, alg)
 
-    loss_function! = NonlinearFunction{true}(loss_fn; resid_prototype, jac = jac_fn,
-        jac_prototype)
+    loss_function! = __unsafe_nonlinearfunction{true}(
+        loss_fn; resid_prototype, jac = jac_fn, jac_prototype)
 
     # NOTE: u_at_nodes is updated inplace
-    nlprob = (M != N ? NonlinearLeastSquaresProblem : NonlinearProblem)(loss_function!,
-        u_at_nodes, prob.p)
-    __solve(nlprob, alg.nlsolve; kwargs..., alias_u0 = true)
+    nlprob = __internal_nlsolve_problem(prob, M, N, loss_function!, u_at_nodes, prob.p)
+    nlsolve_alg = __concrete_nonlinearsolve_algorithm(nlprob, alg.nlsolve)
+    __solve(nlprob, nlsolve_alg; kwargs..., alias_u0 = true)
 
     return nothing
 end
@@ -137,79 +139,77 @@ function __solve_nlproblem!(::StandardBVProblem, alg::MultipleShooting, bcresid_
         solve_internal_odes!::S, bc::BC, prob, f::F, u0_size, u0, ode_cache_loss_fn,
         ensemblealg, internal_ode_kwargs; kwargs...) where {BC, F, S}
     if __any_sparse_ad(alg.jac_alg)
-        J_proto = __generate_sparse_jacobian_prototype(alg, prob.problem_type,
-            bcresid_prototype, u0, N, cur_nshoot)
+        J_proto = __generate_sparse_jacobian_prototype(
+            alg, prob.problem_type, bcresid_prototype, u0, N, cur_nshoot)
     end
     resid_prototype = vcat(bcresid_prototype, similar(u_at_nodes, cur_nshoot * N))
 
     __resid_nodes = resid_prototype[(end - cur_nshoot * N + 1):end]
-    resid_nodes = __maybe_allocate_diffcache(__resid_nodes,
-        pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
+    resid_nodes = __maybe_allocate_diffcache(
+        __resid_nodes, pickchunksize((cur_nshoot + 1) * N), alg.jac_alg.bc_diffmode)
 
-    loss_fn = (du, u, p) -> __multiple_shooting_mpoint_loss!(du, u, p, cur_nshoot,
-        nodes, prob, solve_internal_odes!, resid_len, N, f, bc, u0_size, prob.tspan,
-        alg.ode_alg, u0, ode_cache_loss_fn)
+    loss_fn = @closure (du, u, p) -> __multiple_shooting_mpoint_loss!(
+        du, u, p, cur_nshoot, nodes, prob, solve_internal_odes!, resid_len,
+        N, f, bc, u0_size, prob.tspan, alg.ode_alg, u0, ode_cache_loss_fn)
 
     # ODE Part
     sd_ode = alg.jac_alg.nonbc_diffmode isa AbstractSparseADType ?
              __sparsity_detection_alg(J_proto) : NoSparsityDetection()
-    ode_jac_cache = sparse_jacobian_cache(alg.jac_alg.nonbc_diffmode, sd_ode,
-        nothing, similar(u_at_nodes, cur_nshoot * N), u_at_nodes)
-    ode_cache_ode_jac_fn = __multiple_shooting_init_jacobian_odecache(ensemblealg, prob,
-        ode_jac_cache, alg.jac_alg.nonbc_diffmode, alg.ode_alg, cur_nshoot, u0;
-        internal_ode_kwargs...)
+    ode_jac_cache = sparse_jacobian_cache(alg.jac_alg.nonbc_diffmode, sd_ode, nothing,
+        similar(u_at_nodes, cur_nshoot * N), u_at_nodes)
+    ode_cache_ode_jac_fn = __multiple_shooting_init_jacobian_odecache(
+        ensemblealg, prob, ode_jac_cache, alg.jac_alg.nonbc_diffmode,
+        alg.ode_alg, cur_nshoot, u0; internal_ode_kwargs...)
 
     # BC Part
     sd_bc = alg.jac_alg.bc_diffmode isa AbstractSparseADType ?
             SymbolicsSparsityDetection() : NoSparsityDetection()
-    bc_jac_cache = sparse_jacobian_cache(alg.jac_alg.bc_diffmode,
-        sd_bc, nothing, similar(bcresid_prototype), u_at_nodes)
-    ode_cache_bc_jac_fn = __multiple_shooting_init_jacobian_odecache(ensemblealg, prob,
-        bc_jac_cache, alg.jac_alg.bc_diffmode, alg.ode_alg, cur_nshoot, u0;
-        internal_ode_kwargs...)
+    bc_jac_cache = sparse_jacobian_cache(
+        alg.jac_alg.bc_diffmode, sd_bc, nothing, similar(bcresid_prototype), u_at_nodes)
+    ode_cache_bc_jac_fn = __multiple_shooting_init_jacobian_odecache(
+        ensemblealg, prob, bc_jac_cache, alg.jac_alg.bc_diffmode,
+        alg.ode_alg, cur_nshoot, u0; internal_ode_kwargs...)
 
     jac_prototype = vcat(init_jacobian(bc_jac_cache), init_jacobian(ode_jac_cache))
 
     # Define the functions now
-    ode_fn = (du, u) -> solve_internal_odes!(du, u, prob.p, cur_nshoot, nodes,
-        ode_cache_ode_jac_fn)
-    bc_fn = (du, u) -> __multiple_shooting_mpoint_loss_bc!(
-        du, u, prob.p, cur_nshoot, nodes,
-        prob, solve_internal_odes!, N, f, bc, u0_size, prob.tspan, alg.ode_alg, u0,
-        ode_cache_bc_jac_fn)
+    ode_fn = @closure (du, u) -> solve_internal_odes!(
+        du, u, prob.p, cur_nshoot, nodes, ode_cache_ode_jac_fn)
+    bc_fn = @closure (du, u) -> __multiple_shooting_mpoint_loss_bc!(
+        du, u, prob.p, cur_nshoot, nodes, prob, solve_internal_odes!, N,
+        f, bc, u0_size, prob.tspan, alg.ode_alg, u0, ode_cache_bc_jac_fn)
 
-    jac_fn = (J, u, p) -> __multiple_shooting_mpoint_jacobian!(J, u, p,
-        similar(bcresid_prototype), resid_nodes, ode_jac_cache, bc_jac_cache,
-        ode_fn, bc_fn, alg, N, M)
+    jac_fn = @closure (J, u, p) -> __multiple_shooting_mpoint_jacobian!(
+        J, u, p, similar(bcresid_prototype), resid_nodes,
+        ode_jac_cache, bc_jac_cache, ode_fn, bc_fn, alg, N, M)
 
-    loss_function! = NonlinearFunction{true}(loss_fn; resid_prototype, jac = jac_fn,
-        jac_prototype)
+    loss_function! = __unsafe_nonlinearfunction{true}(
+        loss_fn; resid_prototype, jac_prototype, jac = jac_fn)
 
     # NOTE: u_at_nodes is updated inplace
-    nlprob = (M != N ? NonlinearLeastSquaresProblem : NonlinearProblem)(loss_function!,
-        u_at_nodes, prob.p)
-    __solve(nlprob, alg.nlsolve; kwargs..., alias_u0 = true)
+    nlprob = __internal_nlsolve_problem(prob, M, N, loss_function!, u_at_nodes, prob.p)
+    nlsolve_alg = __concrete_nonlinearsolve_algorithm(nlprob, alg.nlsolve)
+    __solve(nlprob, nlsolve_alg; kwargs..., alias_u0 = true)
 
     return nothing
 end
 
-function __multiple_shooting_init_odecache(::EnsembleSerial, prob, alg, u0, nshoots;
-        kwargs...)
+function __multiple_shooting_init_odecache(
+        ::EnsembleSerial, prob, alg, u0, nshoots; kwargs...)
     odeprob = ODEProblem{isinplace(prob)}(prob.f, u0, prob.tspan, prob.p)
     return SciMLBase.__init(odeprob, alg; kwargs...)
 end
 
-function __multiple_shooting_init_odecache(::EnsembleThreads, prob, alg, u0, nshoots;
-        kwargs...)
+function __multiple_shooting_init_odecache(
+        ::EnsembleThreads, prob, alg, u0, nshoots; kwargs...)
     odeprob = ODEProblem{isinplace(prob)}(prob.f, u0, prob.tspan, prob.p)
     return [SciMLBase.__init(odeprob, alg; kwargs...)
             for _ in 1:min(Threads.nthreads(), nshoots)]
 end
 
-function __multiple_shooting_init_jacobian_odecache(ensemblealg, prob, jac_cache, ad, alg,
-        nshoots, u; kwargs...)
-    return __multiple_shooting_init_odecache(ensemblealg, prob, alg, u, nshoots;
-        kwargs...)
+function __multiple_shooting_init_jacobian_odecache(
+        ensemblealg, prob, jac_cache, ad, alg, nshoots, u; kwargs...)
+    return __multiple_shooting_init_odecache(ensemblealg, prob, alg, u, nshoots; kwargs...)
 end
 
 function __multiple_shooting_init_jacobian_odecache(ensemblealg, prob, jac_cache,
@@ -221,13 +221,14 @@ function __multiple_shooting_init_jacobian_odecache(ensemblealg, prob, jac_cache
         xduals = reshape(cache.t[1:length(u)], size(u))
     end
     fill!(xduals, 0)
-    return __multiple_shooting_init_odecache(ensemblealg, prob, alg, xduals, nshoots;
-        kwargs...)
+    return __multiple_shooting_init_odecache(
+        ensemblealg, prob, alg, xduals, nshoots; kwargs...)
 end
 
 # Not using `EnsembleProblem` since it is hard to initialize the cache and stuff
-function __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoots::Int,
-        odecache, nodes, u0_size, N::Int, ::EnsembleSerial)
+function __multiple_shooting_solve_internal_odes!(
+        resid_nodes, us, cur_nshoots::Int, odecache,
+        nodes, u0_size, N::Int, ::EnsembleSerial, tspan)
     ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
     us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
 
@@ -244,8 +245,9 @@ function __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoots::
     return reduce(vcat, us_), reduce(vcat, ts_)
 end
 
-function __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoots::Int,
-        odecache::Vector, nodes, u0_size, N::Int, ::EnsembleThreads)
+function __multiple_shooting_solve_internal_odes!(
+        resid_nodes, us, cur_nshoots::Int, odecache::Vector,
+        nodes, u0_size, N::Int, ::EnsembleThreads, tspan)
     ts_ = Vector{Vector{typeof(first(tspan))}}(undef, cur_nshoots)
     us_ = Vector{Vector{typeof(us)}}(undef, cur_nshoots)
 
@@ -257,11 +259,10 @@ function __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoots::
         return first:1:last
     end
 
-    Threads.@threads for idx in 1:length(data_partition)
+    Threads.@threads for idx in eachindex(data_partition)
         cache = odecache[idx]
         for i in data_partition[idx]
-            SciMLBase.reinit!(
-                cache, reshape(@view(us[((i - 1) * N + 1):(i * N)]), u0_size);
+            SciMLBase.reinit!(cache, reshape(@view(us[((i - 1) * N + 1):(i * N)]), u0_size);
                 t0 = nodes[i], tf = nodes[i + 1])
             sol = solve!(cache)
             us_[i] = deepcopy(sol.u)
@@ -274,28 +275,28 @@ function __multiple_shooting_solve_internal_odes!(resid_nodes, us, cur_nshoots::
     return reduce(vcat, us_), reduce(vcat, ts_)
 end
 
-function __multiple_shooting_2point_jacobian!(J, us, p, jac_cache, loss_fn::F, resid,
-        alg::MultipleShooting) where {F}
+function __multiple_shooting_2point_jacobian!(
+        J, us, p, jac_cache, loss_fn::F, resid, alg::MultipleShooting) where {F}
     sparse_jacobian!(J, alg.jac_alg.diffmode, jac_cache, loss_fn, resid, us)
     return nothing
 end
 
-function __multiple_shooting_mpoint_jacobian!(J, us, p, resid_bc, resid_nodes,
-        ode_jac_cache, bc_jac_cache, ode_fn::F1, bc_fn::F2, alg::MultipleShooting,
-        N::Int, M::Int) where {F1, F2}
+function __multiple_shooting_mpoint_jacobian!(
+        J, us, p, resid_bc, resid_nodes, ode_jac_cache, bc_jac_cache, ode_fn::F1,
+        bc_fn::F2, alg::MultipleShooting, N::Int, M::Int) where {F1, F2}
     J_bc = @view(J[1:M, :])
     J_c = @view(J[(M + 1):end, :])
 
-    sparse_jacobian!(J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn,
-        resid_nodes.du, us)
+    sparse_jacobian!(
+        J_c, alg.jac_alg.nonbc_diffmode, ode_jac_cache, ode_fn, resid_nodes.du, us)
     sparse_jacobian!(J_bc, alg.jac_alg.bc_diffmode, bc_jac_cache, bc_fn, resid_bc, us)
 
     return nothing
 end
 
-@views function __multiple_shooting_2point_loss!(resid, us, p, cur_nshoots::Int, nodes,
-        prob, solve_internal_odes!::S, resida_len, residb_len, N, bca::BCA, bcb::BCB,
-        ode_cache) where {S, BCA, BCB}
+@views function __multiple_shooting_2point_loss!(
+        resid, us, p, cur_nshoots::Int, nodes, prob, solve_internal_odes!::S,
+        resida_len, residb_len, N, bca::BCA, bcb::BCB, ode_cache) where {S, BCA, BCB}
     resid_ = resid[(resida_len + 1):(end - residb_len)]
     solve_internal_odes!(resid_, us, p, cur_nshoots, nodes, ode_cache)
 
@@ -316,9 +317,9 @@ end
     return nothing
 end
 
-@views function __multiple_shooting_mpoint_loss_bc!(resid_bc, us, p, cur_nshoots::Int,
-        nodes, prob, solve_internal_odes!::S, N, f::F, bc::BC, u0_size, tspan,
-        ode_alg, u0, ode_cache) where {S, F, BC}
+@views function __multiple_shooting_mpoint_loss_bc!(
+        resid_bc, us, p, cur_nshoots::Int, nodes, prob, solve_internal_odes!::S,
+        N, f::F, bc::BC, u0_size, tspan, ode_alg, u0, ode_cache) where {S, F, BC}
     iip = isinplace(prob)
     _resid_nodes = similar(us, cur_nshoots * N)
 
@@ -337,9 +338,9 @@ end
     return nothing
 end
 
-@views function __multiple_shooting_mpoint_loss!(resid, us, p, cur_nshoots::Int, nodes,
-        prob, solve_internal_odes!::S, resid_len, N, f::F, bc::BC, u0_size, tspan,
-        ode_alg, u0, ode_cache) where {S, F, BC}
+@views function __multiple_shooting_mpoint_loss!(
+        resid, us, p, cur_nshoots::Int, nodes, prob, solve_internal_odes!::S, resid_len,
+        N, f::F, bc::BC, u0_size, tspan, ode_alg, u0, ode_cache) where {S, F, BC}
     iip = isinplace(prob)
     resid_bc = resid[1:resid_len]
     resid_nodes = resid[(resid_len + 1):end]
@@ -360,16 +361,15 @@ end
 
 # Problem has initial guess
 @views function __multiple_shooting_initialize!(
-        nodes, prob, alg, ::Val{true}, nshoots::Int,
-        odecache; kwargs...)
-    @unpack u0, tspan = prob
+        nodes, prob, alg, ::Val{true}, nshoots::Int, odecache; kwargs...)
+    (; u0, tspan) = prob
 
     resize!(nodes, nshoots + 1)
     nodes .= range(tspan[1], tspan[2]; length = nshoots + 1)
 
     # NOTE: We don't check `u0 isa Function` since `u0` in-principle can be a callable
     #       struct
-    u0_ = u0 isa AbstractArray ? u0 : [__initial_guess(u0, prob.p, t) for t in nodes]
+    u0_ = u0 isa VectorOfArray ? u0 : [__initial_guess(u0, prob.p, t) for t in nodes]
 
     N = length(first(u0_))
     u_at_nodes = similar(first(u0_), (nshoots + 1) * N)
@@ -379,10 +379,11 @@ end
 end
 
 # No initial guess
-@views function __multiple_shooting_initialize!(nodes, prob, alg::MultipleShooting,
-        ::Val{false}, nshoots::Int, odecache_; verbose, kwargs...)
-    @unpack f, u0, tspan, p = prob
-    @unpack ode_alg = alg
+@views function __multiple_shooting_initialize!(
+        nodes, prob, alg::MultipleShooting, ::Val{false},
+        nshoots::Int, odecache_; verbose, kwargs...)
+    (; f, u0, tspan, p) = prob
+    (; ode_alg) = alg
 
     resize!(nodes, nshoots + 1)
     nodes .= range(tspan[1], tspan[2]; length = nshoots + 1)
@@ -404,14 +405,13 @@ end
     sol = solve!(odecache)
 
     if SciMLBase.successful_retcode(sol)
-        res = sol(nodes).u
-        for i in 1:length(nodes)
-            u_at_nodes[(i - 1) * N .+ (1:N)] .= vec(res[i])
+        for i in eachindex(nodes)
+            u_at_nodes[(i - 1) * N .+ (1:N)] .= vec(sol(nodes[i]))
         end
     else
         @warn "Initialization using odesolve failed. Initializing using 0s. It is \
                recommended to provide an initial guess function via \
-               `u0 = <function>(p, t)` or `u0 = <function>(t)` in this case."
+               `u0 = <function>(p, t)` in this case."
         fill!(u_at_nodes, 0)
     end
 
@@ -419,9 +419,9 @@ end
 end
 
 # Grid coarsening
-@views function __multiple_shooting_initialize!(nodes, u_at_nodes_prev, prob, alg,
-        nshoots, old_nshoots, ig, odecache_, u0; kwargs...)
-    @unpack f, tspan, p = prob
+@views function __multiple_shooting_initialize!(nodes, u_at_nodes_prev, prob, alg, nshoots,
+        old_nshoots, ig, odecache_, u0; kwargs...)
+    (; f, tspan, p) = prob
     prev_nodes = copy(nodes)
     odecache = odecache_ isa Vector ? first(odecache_) : odecache_
 
diff --git a/src/solve/single_shooting.jl b/src/solve/single_shooting.jl
index 758ae9ae..c969284b 100644
--- a/src/solve/single_shooting.jl
+++ b/src/solve/single_shooting.jl
@@ -1,8 +1,13 @@
 function __solve(prob::BVProblem, alg_::Shooting; odesolve_kwargs = (;),
         nlsolve_kwargs = (;), verbose = true, kwargs...)
-    ig, T, N, _, u0 = __extract_problem_details(prob; dt = 0.1)
-    _unwrap_val(ig) && verbose &&
-        @warn "Initial guess provided, but will be ignored for Shooting!"
+    # Setup the problem
+    if prob.u0 isa AbstractArray{<:Number}
+        u0 = prob.u0
+    else
+        verbose && @warn "Initial guess provided, but will be ignored for Shooting."
+        u0 = __extract_u0(prob.u0, prob.p, first(prob.tspan))
+    end
+    T, N = eltype(u0), length(u0)
 
     alg = concretize_jacobian_algorithm(alg_, prob)
 
@@ -11,67 +16,72 @@ function __solve(prob::BVProblem, alg_::Shooting; odesolve_kwargs = (;),
     resid_prototype = __vec(bcresid_prototype)
 
     # Construct the residual function
-    ode_kwargs = (; kwargs..., verbose, odesolve_kwargs...)
+    actual_ode_kwargs = (; kwargs..., verbose, odesolve_kwargs...)
+    # For TwoPointBVPs we don't need to save every step
+    if prob.problem_type isa TwoPointBVProblem
+        ode_kwargs = (; save_everystep = false, actual_ode_kwargs...)
+    else
+        ode_kwargs = (; actual_ode_kwargs...)
+    end
     internal_prob = ODEProblem{iip}(prob.f, u0, prob.tspan, prob.p)
     ode_cache_loss_fn = SciMLBase.__init(internal_prob, alg.ode_alg; ode_kwargs...)
 
     loss_fn = if iip
-        (du, u, p) -> __single_shooting_loss!(du, u, p, ode_cache_loss_fn, bc, u0_size,
-            prob.problem_type, resid_size)
+        @closure (du, u, p) -> __single_shooting_loss!(
+            du, u, p, ode_cache_loss_fn, bc, u0_size, prob.problem_type, resid_size)
     else
-        (u, p) -> __single_shooting_loss(u, p, ode_cache_loss_fn, bc, u0_size,
-            prob.problem_type)
+        @closure (u, p) -> __single_shooting_loss(
+            u, p, ode_cache_loss_fn, bc, u0_size, prob.problem_type)
     end
 
-    # Construct the jacobian function
-    # NOTE: We pass in a separate Jacobian Function because that allows us to cache the
-    #       the internal ode solve cache. This cache needs to be distinct from the regular
-    #       residual function cache
     sd = alg.jac_alg.diffmode isa AbstractSparseADType ? SymbolicsSparsityDetection() :
          NoSparsityDetection()
     y_ = similar(resid_prototype)
 
+    # Construct the jacobian function
+    # NOTE: We pass in a separate Jacobian Function because that allows us to cache the
+    #       the internal ode solve cache. This cache needs to be distinct from the regular
+    #       residual function cache
     jac_cache = if iip
         sparse_jacobian_cache(alg.jac_alg.diffmode, sd, nothing, y_, vec(u0))
     else
         sparse_jacobian_cache(alg.jac_alg.diffmode, sd, nothing, vec(u0); fx = y_)
     end
 
-    ode_cache_jac_fn = __single_shooting_jacobian_ode_cache(internal_prob, jac_cache,
-        alg.jac_alg.diffmode, u0, alg.ode_alg; ode_kwargs...)
+    ode_cache_jac_fn = __single_shooting_jacobian_ode_cache(
+        internal_prob, jac_cache, alg.jac_alg.diffmode, u0, alg.ode_alg; ode_kwargs...)
 
     jac_prototype = init_jacobian(jac_cache)
 
     loss_fnₚ = if iip
-        (du, u) -> __single_shooting_loss!(du, u, prob.p, ode_cache_jac_fn, bc, u0_size,
-            prob.problem_type, resid_size)
+        @closure (du, u) -> __single_shooting_loss!(
+            du, u, prob.p, ode_cache_jac_fn, bc, u0_size, prob.problem_type, resid_size)
     else
-        (u) -> __single_shooting_loss(u, prob.p, ode_cache_jac_fn, bc, u0_size,
-            prob.problem_type)
+        @closure (u) -> __single_shooting_loss(
+            u, prob.p, ode_cache_jac_fn, bc, u0_size, prob.problem_type)
     end
 
     jac_fn = if iip
-        (J, u, p) -> __single_shooting_jacobian!(J, u, jac_cache, alg.jac_alg.diffmode,
-            loss_fnₚ, y_)
+        @closure (J, u, p) -> __single_shooting_jacobian!(
+            J, u, jac_cache, alg.jac_alg.diffmode, loss_fnₚ, y_)
     else
-        (u, p) -> __single_shooting_jacobian(jac_prototype, u, jac_cache,
-            alg.jac_alg.diffmode, loss_fnₚ)
+        @closure (u, p) -> __single_shooting_jacobian(
+            jac_prototype, u, jac_cache, alg.jac_alg.diffmode, loss_fnₚ)
     end
 
-    nlf = NonlinearFunction{iip}(loss_fn; jac_prototype, resid_prototype, jac = jac_fn)
-    nlprob = if length(resid_prototype) == length(u0)
-        NonlinearProblem(nlf, vec(u0), prob.p)
-    else
-        NonlinearLeastSquaresProblem(nlf, vec(u0), prob.p)
-    end
-    opt = __solve(nlprob, alg.nlsolve; nlsolve_kwargs..., verbose, kwargs...)
+    nlf = __unsafe_nonlinearfunction{iip}(
+        loss_fn; jac_prototype, resid_prototype, jac = jac_fn)
+    nlprob = __internal_nlsolve_problem(prob, resid_prototype, u0, nlf, vec(u0), prob.p)
+    nlsolve_alg = __concrete_nonlinearsolve_algorithm(nlprob, alg.nlsolve)
+    nlsol = __solve(nlprob, nlsolve_alg; nlsolve_kwargs..., verbose, kwargs...)
 
-    SciMLBase.reinit!(ode_cache_loss_fn, reshape(opt.u, u0_size))
-    sol = solve!(ode_cache_loss_fn)
+    # There is no way to reinit with the same cache with different cache. But not saving
+    # the internal values gives a significant speedup. So we just create a new cache
+    internal_prob_final = ODEProblem{iip}(
+        prob.f, reshape(nlsol.u, u0_size), prob.tspan, prob.p)
+    odesol = __solve(internal_prob_final, alg.ode_alg; actual_ode_kwargs...)
 
-    !SciMLBase.successful_retcode(opt) &&
-        return SciMLBase.solution_new_retcode(sol, ReturnCode.Failure)
-    return sol
+    return __build_solution(prob, odesol, nlsol)
 end
 
 function __single_shooting_loss!(resid_, u0_, p, cache, bc::BC, u0_size,
@@ -103,7 +113,7 @@ end
 function __single_shooting_loss(u, p, cache, bc::BC, u0_size, pt) where {BC}
     SciMLBase.reinit!(cache, reshape(u, u0_size))
     odesol = solve!(cache)
-    return __safe_vec(eval_bc_residual(pt, bc, odesol, p))
+    return __vec(eval_bc_residual(pt, bc, odesol, p))
 end
 
 function __single_shooting_jacobian!(J, u, jac_cache, diffmode, loss_fn::L, fu) where {L}
@@ -117,12 +127,12 @@ function __single_shooting_jacobian(J, u, jac_cache, diffmode, loss_fn::L) where
 end
 
 function __single_shooting_jacobian_ode_cache(prob, jac_cache, alg, u0, ode_alg; kwargs...)
-    prob_ = remake(prob; u0)
-    return SciMLBase.__init(prob_, ode_alg; kwargs...)
+    return SciMLBase.__init(remake(prob; u0), ode_alg; kwargs...)
 end
 
-function __single_shooting_jacobian_ode_cache(prob, jac_cache,
-        ::Union{AutoForwardDiff, AutoSparseForwardDiff}, u0, ode_alg; kwargs...)
+function __single_shooting_jacobian_ode_cache(
+        prob, jac_cache, ::Union{AutoForwardDiff, AutoSparseForwardDiff},
+        u0, ode_alg; kwargs...)
     cache = jac_cache.cache
     if cache isa ForwardDiff.JacobianConfig
         xduals = cache.duals isa Tuple ? cache.duals[2] : cache.duals
@@ -130,6 +140,7 @@ function __single_shooting_jacobian_ode_cache(prob, jac_cache,
         xduals = cache.t
     end
     fill!(xduals, 0)
-    prob_ = remake(prob; u0 = reshape(xduals, size(u0)))
+    prob_ = remake(
+        prob; u0 = reshape(xduals, size(u0)), tspan = eltype(xduals).(prob.tspan))
     return SciMLBase.__init(prob_, ode_alg; kwargs...)
 end
diff --git a/src/sparse_jacobians.jl b/src/sparse_jacobians.jl
index 1983d010..27939969 100644
--- a/src/sparse_jacobians.jl
+++ b/src/sparse_jacobians.jl
@@ -32,8 +32,8 @@ Base.eltype(M::ColoredMatrix) = eltype(M.M)
 ColoredMatrix() = ColoredMatrix(nothing, nothing, nothing)
 
 function __sparsity_detection_alg(M::ColoredMatrix)
-    return PrecomputedJacobianColorvec(; jac_prototype = M.M, M.row_colorvec,
-        M.col_colorvec)
+    return PrecomputedJacobianColorvec(;
+        jac_prototype = M.M, M.row_colorvec, M.col_colorvec)
 end
 __sparsity_detection_alg(::ColoredMatrix{Nothing}) = NoSparsityDetection()
 
@@ -53,16 +53,16 @@ function __generate_sparse_jacobian_prototype(cache::MIRKCache, ya, yb, M, N)
     return __generate_sparse_jacobian_prototype(cache, cache.problem_type, ya, yb, M, N)
 end
 
-function __generate_sparse_jacobian_prototype(::MIRKCache, ::StandardBVProblem, ya, yb, M,
-        N)
+function __generate_sparse_jacobian_prototype(
+        ::MIRKCache, ::StandardBVProblem, ya, yb, M, N)
     fast_scalar_indexing(ya) ||
         error("Sparse Jacobians are only supported for Fast Scalar Index-able Arrays")
     J_c = BandedMatrix(Ones{eltype(ya)}(M * (N - 1), M * N), (1, 2M - 1))
     return ColoredMatrix(J_c, matrix_colors(J_c'), matrix_colors(J_c))
 end
 
-function __generate_sparse_jacobian_prototype(::MIRKCache, ::TwoPointBVProblem,
-        ya, yb, M, N)
+function __generate_sparse_jacobian_prototype(
+        ::MIRKCache, ::TwoPointBVProblem, ya, yb, M, N)
     fast_scalar_indexing(ya) ||
         error("Sparse Jacobians are only supported for Fast Scalar Index-able Arrays")
     J₁ = length(ya) + length(yb) + M * (N - 1)
@@ -82,10 +82,10 @@ end
 
 Returns a 3-Tuple:
 
-* Entire Jacobian Prototype (if Two-Point Problem) else `nothing`.
-* Sparse Non-BC Part Jacobian Prototype along with the column and row color vectors.
-* Sparse BC Part Jacobian Prototype along with the column and row color vectors (if
-  Two-Point Problem) else `nothing`.
+  - Entire Jacobian Prototype (if Two-Point Problem) else `nothing`.
+  - Sparse Non-BC Part Jacobian Prototype along with the column and row color vectors.
+  - Sparse BC Part Jacobian Prototype along with the column and row color vectors (if
+    Two-Point Problem) else `nothing`.
 """
 function __generate_sparse_jacobian_prototype(::MultipleShooting, ::StandardBVProblem,
         bcresid_prototype, u0, N::Int, nshoots::Int)
diff --git a/src/types.jl b/src/types.jl
index 0eb3db9b..d0982fed 100644
--- a/src/types.jl
+++ b/src/types.jl
@@ -13,8 +13,6 @@ struct MIRKTableau{sType, cType, vType, bType, xType}
     end
 end
 
-@truncate_stacktrace MIRKTableau 1
-
 struct MIRKInterpTableau{s, c, v, x, τ}
     s_star::s
     c_star::c
@@ -24,14 +22,12 @@ struct MIRKInterpTableau{s, c, v, x, τ}
 
     function MIRKInterpTableau(s_star, c_star, v_star, x_star, τ_star)
         @assert eltype(c_star) == eltype(v_star) == eltype(x_star)
-        return new{typeof(s_star), typeof(c_star), typeof(v_star), typeof(x_star),
-            typeof(τ_star)}(s_star,
-            c_star, v_star, x_star, τ_star)
+        return new{
+            typeof(s_star), typeof(c_star), typeof(v_star), typeof(x_star), typeof(τ_star)}(
+            s_star, c_star, v_star, x_star, τ_star)
     end
 end
 
-@truncate_stacktrace MIRKInterpTableau 1
-
 # Sparsity Detection
 @concrete struct BVPJacobianAlgorithm
     bc_diffmode
@@ -39,14 +35,40 @@ end
     diffmode
 end
 
+@inline __materialize_jacobian_algorithm(_, alg::BVPJacobianAlgorithm) = alg
+@inline __materialize_jacobian_algorithm(_, alg::ADTypes.AbstractADType) = BVPJacobianAlgorithm(alg)
+@inline __materialize_jacobian_algorithm(::Nothing, ::Nothing) = BVPJacobianAlgorithm()
+@inline function __materialize_jacobian_algorithm(nlsolve::N, ::Nothing) where {N}
+    ad = hasfield(N, :jacobian_ad) ? nlsolve.jacobian_ad : missing
+    return BVPJacobianAlgorithm(ad)
+end
+
+function Base.show(io::IO, alg::BVPJacobianAlgorithm)
+    print(io, "BVPJacobianAlgorithm(")
+    modifiers = String[]
+    if alg.diffmode !== nothing && alg.diffmode !== missing
+        push!(modifiers, "diffmode = $(__nameof(alg.diffmode))()")
+    else
+        if alg.nonbc_diffmode !== missing && alg.nonbc_diffmode !== nothing
+            push!(modifiers, "nonbc_diffmode = $(__nameof(alg.nonbc_diffmode))()")
+        end
+        if alg.bc_diffmode !== missing && alg.bc_diffmode !== nothing
+            push!(modifiers, "bc_diffmode = $(__nameof(alg.bc_diffmode))()")
+        end
+    end
+    print(io, join(modifiers, ", "))
+    print(io, ")")
+end
+
 __any_sparse_ad(ad) = ad isa AbstractSparseADType
 function __any_sparse_ad(jac_alg::BVPJacobianAlgorithm)
-    __any_sparse_ad(jac_alg.bc_diffmode) || __any_sparse_ad(jac_alg.nonbc_diffmode) ||
+    __any_sparse_ad(jac_alg.bc_diffmode) ||
+        __any_sparse_ad(jac_alg.nonbc_diffmode) ||
         __any_sparse_ad(jac_alg.diffmode)
 end
 
-function BVPJacobianAlgorithm(diffmode = missing; nonbc_diffmode = missing,
-        bc_diffmode = missing)
+function BVPJacobianAlgorithm(
+        diffmode = missing; nonbc_diffmode = missing, bc_diffmode = missing)
     if diffmode !== missing
         bc_diffmode = bc_diffmode === missing ? diffmode : bc_diffmode
         nonbc_diffmode = nonbc_diffmode === missing ? diffmode : nonbc_diffmode
@@ -74,10 +96,9 @@ function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob::BVProb
     return concrete_jacobian_algorithm(jac_alg, prob.problem_type, prob, alg)
 end
 
-function concrete_jacobian_algorithm(jac_alg::BVPJacobianAlgorithm, prob_type,
-        prob::BVProblem, alg)
-    u0 = prob.u0 isa AbstractArray ? prob.u0 :
-         __initial_guess(prob.u0, prob.p, first(prob.tspan))
+function concrete_jacobian_algorithm(
+        jac_alg::BVPJacobianAlgorithm, prob_type, prob::BVProblem, alg)
+    u0 = __extract_u0(prob.u0, prob.p, first(prob.tspan))
     diffmode = jac_alg.diffmode === nothing ? __default_sparse_ad(u0) : jac_alg.diffmode
     bc_diffmode = jac_alg.bc_diffmode === nothing ?
                   (prob_type isa TwoPointBVProblem ? __default_sparse_ad :
@@ -121,19 +142,15 @@ function concretize_jacobian_algorithm(alg, prob)
     return alg
 end
 
-function MIRKJacobianComputationAlgorithm(diffmode = missing;
-        collocation_diffmode = missing, bc_diffmode = missing)
-    Base.depwarn("`MIRKJacobianComputationAlgorithm` has been deprecated in favor of \
-        `BVPJacobianAlgorithm`. Replace `collocation_diffmode` with `nonbc_diffmode",
-        :MIRKJacobianComputationAlgorithm)
-    return BVPJacobianAlgorithm(diffmode; nonbc_diffmode = collocation_diffmode,
-        bc_diffmode)
-end
+Base.@deprecate MIRKJacobianComputationAlgorithm(
+    diffmode = missing; collocation_diffmode = missing, bc_diffmode = missing) BVPJacobianAlgorithm(
+    diffmode; nonbc_diffmode = collocation_diffmode, bc_diffmode)
 
 __needs_diffcache(::Union{AutoForwardDiff, AutoSparseForwardDiff}) = true
 __needs_diffcache(_) = false
 function __needs_diffcache(jac_alg::BVPJacobianAlgorithm)
-    return __needs_diffcache(jac_alg.diffmode) || __needs_diffcache(jac_alg.bc_diffmode) ||
+    return __needs_diffcache(jac_alg.diffmode) ||
+           __needs_diffcache(jac_alg.bc_diffmode) ||
            __needs_diffcache(jac_alg.nonbc_diffmode)
 end
 
@@ -148,6 +165,14 @@ end
 __maybe_allocate_diffcache(x::DiffCache, chunksize) = DiffCache(similar(x.du), chunksize)
 __maybe_allocate_diffcache(x::FakeDiffCache, _) = FakeDiffCache(similar(x.du))
 
-PreallocationTools.get_tmp(dc::FakeDiffCache, _) = dc.du
-
 const MaybeDiffCache = Union{DiffCache, FakeDiffCache}
+
+## get_tmp shows a warning as it should on cache exapansion, this behavior however is
+## expected for adaptive BVP solvers so we write our own `get_tmp` and drop the warning logs
+@inline get_tmp(dc::FakeDiffCache, u) = dc.du
+
+@inline function get_tmp(dc, u)
+    return Logging.with_logger(Logging.NullLogger()) do
+        PreallocationTools.get_tmp(dc, u)
+    end
+end
diff --git a/src/utils.jl b/src/utils.jl
index 8d872bc2..bfa2d101 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -15,8 +15,8 @@ end
     end
     return y
 end
-@views function recursive_flatten_twopoint!(y::AbstractVector, x::Vector{<:AbstractArray},
-        sizes)
+@views function recursive_flatten_twopoint!(
+        y::AbstractVector, x::Vector{<:AbstractArray}, sizes)
     x_, xiter = Iterators.peel(x)
     copyto!(y[1:prod(sizes[1])], x_[1:prod(sizes[1])])
     i = prod(sizes[1])
@@ -63,9 +63,9 @@ end
 #       `w` to the GPU too many times. Instead if we iterate of w and w′ we save
 #       that cost. Our main cost is anyways going to be due to a large `u0` and
 #       we are going to use GPUs for that
-function __maybe_matmul!(z, A, b, α = eltype(z)(1), β = eltype(z)(0))
-    for j in eachindex(b)
-        z .= α .* A[:, j] .* b[j] .+ β .* z
+@views function __maybe_matmul!(z, A, b, α = eltype(z)(1), β = eltype(z)(0))
+    @simd ivdep for j in eachindex(b)
+        @inbounds @. z = α * A[:, j] * b[j] + β * z
     end
     return z
 end
@@ -85,16 +85,16 @@ function eval_bc_residual!(resid, pt, bc!::BC, sol, p) where {BC}
     return eval_bc_residual!(resid, pt, bc!, sol, p, sol.t)
 end
 eval_bc_residual!(resid, _, bc!::BC, sol, p, t) where {BC} = bc!(resid, sol, p, t)
-@views function eval_bc_residual!(resid, ::TwoPointBVProblem, (bca!, bcb!)::BC, sol, p,
-        t) where {BC}
+@views function eval_bc_residual!(
+        resid, ::TwoPointBVProblem, (bca!, bcb!)::BC, sol, p, t) where {BC}
     ua = sol isa AbstractVector ? sol[1] : sol(first(t))
     ub = sol isa AbstractVector ? sol[end] : sol(last(t))
     bca!(resid.resida, ua, p)
     bcb!(resid.residb, ub, p)
     return resid
 end
-@views function eval_bc_residual!(resid::Tuple, ::TwoPointBVProblem, (bca!, bcb!)::BC, sol,
-        p, t) where {BC}
+@views function eval_bc_residual!(
+        resid::Tuple, ::TwoPointBVProblem, (bca!, bcb!)::BC, sol, p, t) where {BC}
     ua = sol isa AbstractVector ? sol[1] : sol(first(t))
     ub = sol isa AbstractVector ? sol[end] : sol(last(t))
     bca!(resid[1], ua, p)
@@ -132,15 +132,21 @@ function __extract_problem_details(prob, u0::AbstractVector{<:AbstractArray}; kw
     _u0 = first(u0)
     return Val(true), eltype(_u0), length(_u0), (length(u0) - 1), _u0
 end
-function __extract_problem_details(prob, u0::AbstractArray; dt = 0.0,
-        check_positive_dt::Bool = false)
+function __extract_problem_details(
+        prob, u0::RecursiveArrayTools.AbstractVectorOfArray; kwargs...)
+    # Problem has Initial Guess
+    _u0 = first(u0.u)
+    return Val(true), eltype(_u0), length(_u0), (length(u0.u) - 1), _u0
+end
+function __extract_problem_details(
+        prob, u0::AbstractArray; dt = 0.0, check_positive_dt::Bool = false)
     # Problem does not have Initial Guess
     check_positive_dt && dt ≤ 0 && throw(ArgumentError("dt must be positive"))
     t₀, t₁ = prob.tspan
     return Val(false), eltype(u0), length(u0), Int(cld(t₁ - t₀, dt)), prob.u0
 end
-function __extract_problem_details(prob, f::F; dt = 0.0,
-        check_positive_dt::Bool = false) where {F <: Function}
+function __extract_problem_details(
+        prob, f::F; dt = 0.0, check_positive_dt::Bool = false) where {F <: Function}
     # Problem passes in a initial guess function
     check_positive_dt && dt ≤ 0 && throw(ArgumentError("dt must be positive"))
     u0 = __initial_guess(f, prob.p, prob.tspan[1])
@@ -149,9 +155,9 @@ function __extract_problem_details(prob, f::F; dt = 0.0,
 end
 
 function __initial_guess(f::F, p::P, t::T) where {F, P, T}
-    if static_hasmethod(f, Tuple{P, T})
+    if hasmethod(f, Tuple{P, T})
         return f(p, t)
-    elseif static_hasmethod(f, Tuple{T})
+    elseif hasmethod(f, Tuple{T})
         Base.depwarn("initial guess function must take 2 inputs `(p, t)` instead of just \
                      `t`. The single argument version has been deprecated and will be \
                      removed in the next major release of SciMLBase.",
@@ -162,19 +168,6 @@ function __initial_guess(f::F, p::P, t::T) where {F, P, T}
     end
 end
 
-function __initial_state_from_prob(prob::BVProblem, mesh)
-    return __initial_state_from_prob(prob, prob.u0, mesh)
-end
-function __initial_state_from_prob(::BVProblem, u0::AbstractArray, mesh)
-    return [copy(vec(u0)) for _ in mesh]
-end
-function __initial_state_from_prob(::BVProblem, u0::AbstractVector{<:AbstractVector}, _)
-    return [copy(vec(u)) for u in u0]
-end
-function __initial_state_from_prob(prob::BVProblem, f::F, mesh) where {F}
-    return [__initial_guess(f, prob.p, t) for t in mesh]
-end
-
 function __get_bcresid_prototype(prob::BVProblem, u)
     return __get_bcresid_prototype(prob.problem_type, prob, u)
 end
@@ -251,4 +244,155 @@ function __restructure_sol(sol::Vector{<:AbstractArray}, u_size)
     return map(Base.Fix2(reshape, u_size), sol)
 end
 
-# TODO: Add dispatch for a ODESolution Type as well
+# Override the checks for NonlinearFunction
+struct __unsafe_nonlinearfunction{iip} end
+
+@inline function __unsafe_nonlinearfunction{iip}(
+        f::F; jac::J = nothing, jac_prototype::JP = nothing, colorvec::CV = nothing,
+        resid_prototype::RP = nothing) where {iip, F, J, JP, CV, RP}
+    return NonlinearFunction{
+        iip, SciMLBase.FullSpecialize, F, Nothing, Nothing, Nothing, J, Nothing,
+        Nothing, JP, Nothing, Nothing, Nothing, Nothing, Nothing, CV, Nothing, RP}(
+        f, nothing, nothing, nothing, jac, nothing, nothing, jac_prototype, nothing,
+        nothing, nothing, nothing, nothing, colorvec, nothing, resid_prototype)
+end
+
+@inline __nameof(::T) where {T} = nameof(T)
+@inline __nameof(::Type{T}) where {T} = nameof(T)
+
+# Construct the internal NonlinearProblem
+@inline function __internal_nlsolve_problem(
+        ::BVProblem{uType, tType, iip, nlls}, resid_prototype,
+        u0, args...; kwargs...) where {uType, tType, iip, nlls}
+    if nlls
+        return NonlinearLeastSquaresProblem(args...; kwargs...)
+    else
+        return NonlinearProblem(args...; kwargs...)
+    end
+end
+
+@inline function __internal_nlsolve_problem(
+        bvp::BVProblem{uType, tType, iip, Nothing}, resid_prototype,
+        u0, args...; kwargs...) where {uType, tType, iip}
+    return __internal_nlsolve_problem(
+        bvp, length(resid_prototype), length(u0), args...; kwargs...)
+end
+
+@inline function __internal_nlsolve_problem(
+        ::BVProblem{uType, tType, iip, Nothing}, l1::Int,
+        l2::Int, args...; kwargs...) where {uType, tType, iip}
+    if l1 != l2
+        return NonlinearLeastSquaresProblem(args...; kwargs...)
+    else
+        return NonlinearProblem(args...; kwargs...)
+    end
+end
+
+# Handling Initial Guesses
+"""
+    __extract_u0(u₀, t₀)
+
+Takes the input initial guess and returns the value at the starting mesh point.
+"""
+@inline __extract_u0(u₀::AbstractVector{<:AbstractArray}, p, t₀) = u₀[1]
+@inline __extract_u0(u₀::VectorOfArray, p, t₀) = u₀[:, 1]
+@inline __extract_u0(u₀::DiffEqArray, p, t₀) = u₀.u[1]
+@inline __extract_u0(u₀::F, p, t₀) where {F <: Function} = __initial_guess(u₀, p, t₀)
+@inline __extract_u0(u₀::AbstractArray, p, t₀) = u₀
+@inline __extract_u0(u₀::T, p, t₀) where {T} = error("`prob.u0::$(T)` is not supported.")
+
+"""
+    __extract_mesh(u₀, t₀, t₁, n)
+
+Takes the input initial guess and returns the mesh.
+"""
+@inline __extract_mesh(u₀, t₀, t₁, n::Int) = collect(range(t₀; stop = t₁, length = n + 1))
+@inline __extract_mesh(u₀, t₀, t₁, dt::Number) = collect(t₀:dt:t₁)
+@inline __extract_mesh(u₀::DiffEqArray, t₀, t₁, ::Int) = u₀.t
+@inline __extract_mesh(u₀::DiffEqArray, t₀, t₁, ::Number) = u₀.t
+
+"""
+    __has_initial_guess(u₀) -> Bool
+
+Returns `true` if the input has an initial guess.
+"""
+@inline __has_initial_guess(u₀::AbstractVector{<:AbstractArray}) = true
+@inline __has_initial_guess(u₀::VectorOfArray) = true
+@inline __has_initial_guess(u₀::DiffEqArray) = true
+@inline __has_initial_guess(u₀::F) where {F} = true
+@inline __has_initial_guess(u₀::AbstractArray) = false
+
+"""
+    __initial_guess_length(u₀) -> Int
+
+Returns the length of the initial guess. If the initial guess is a function or no initial
+guess is supplied, it returns `-1`.
+"""
+@inline __initial_guess_length(u₀::AbstractVector{<:AbstractArray}) = length(u₀)
+@inline __initial_guess_length(u₀::VectorOfArray) = length(u₀)
+@inline __initial_guess_length(u₀::DiffEqArray) = length(u₀.t)
+@inline __initial_guess_length(u₀::F) where {F} = -1
+@inline __initial_guess_length(u₀::AbstractArray) = -1
+
+"""
+    __flatten_initial_guess(u₀) -> Union{AbstractMatrix, AbstractVector, Nothing}
+
+Flattens the initial guess into a matrix. For a function `u₀`, it returns `nothing`. For no
+initial guess, it returns `vec(u₀)`.
+"""
+@inline __flatten_initial_guess(u₀::AbstractVector{<:AbstractArray}) = mapreduce(
+    vec, hcat, u₀)
+@inline __flatten_initial_guess(u₀::VectorOfArray) = mapreduce(vec, hcat, u₀.u)
+@inline __flatten_initial_guess(u₀::DiffEqArray) = mapreduce(vec, hcat, u₀.u)
+@inline __flatten_initial_guess(u₀::AbstractArray) = vec(u₀)
+@inline __flatten_initial_guess(u₀::F) where {F} = nothing
+
+"""
+    __initial_guess_on_mesh(u₀, mesh, p, alias_u0::Bool)
+
+Returns the initial guess on the mesh. For `DiffEqArray` assumes that the mesh is the same
+as the mesh of the `DiffEqArray`.
+
+If `alias_u0` is set to `true`, we try our best to minimize copies. This means that `u₀`
+or parts of it will get mutated.
+"""
+@inline function __initial_guess_on_mesh(
+        u₀::AbstractVector{<:AbstractArray}, _, p, alias_u0::Bool)
+    return alias_u0 ? vec.(u₀) : [copy(vec(u)) for u in u₀]
+end
+@inline function __initial_guess_on_mesh(u₀::VectorOfArray, _, p, alias_u0::Bool)
+    return alias_u0 ? u₀.u : [copy(vec(u)) for u in u₀.u]
+end
+@inline function __initial_guess_on_mesh(u₀::DiffEqArray, mesh, p, alias_u0::Bool)
+    return alias_u0 ? u₀.u : [copy(vec(u)) for u in u₀.u]
+end
+@inline function __initial_guess_on_mesh(u₀::AbstractArray, mesh, p, alias_u0::Bool)
+    return [copy(vec(u₀)) for _ in mesh]
+end
+@inline function __initial_guess_on_mesh(u₀::F, mesh, p, alias_u0::Bool) where {F}
+    return [vec(__initial_guess(u₀, p, t)) for t in mesh]
+end
+
+# Construct BVP Solution
+function __build_solution(prob::BVProblem, odesol, nlsol)
+    retcode = ifelse(SciMLBase.successful_retcode(nlsol), odesol.retcode, nlsol.retcode)
+    return __solution_new_original_retcode(odesol, nlsol, retcode, nlsol.resid)
+end
+
+function __solution_new_original_retcode(
+        sol::ODESolution{T, N}, original, retcode, resid) where {T, N}
+    return ODESolution{
+        T, N, typeof(sol.u), typeof(sol.u_analytic), typeof(sol.errors), typeof(sol.t),
+        typeof(sol.k), typeof(sol.prob), typeof(sol.alg), typeof(sol.interp),
+        typeof(sol.stats), typeof(sol.alg_choice), typeof(resid), typeof(original)}(
+        sol.u, sol.u_analytic, sol.errors, sol.t, sol.k, sol.prob, sol.alg, sol.interp,
+        sol.dense, sol.tslocation, sol.stats, sol.alg_choice, retcode, resid, original)
+end
+
+# Fix3
+@concrete struct __Fix3
+    f
+    x
+end
+
+@inline (f::__Fix3{F})(a, b) where {F} = f.f(a, b, f.x)
diff --git a/test/mirk/ensemble.jl b/test/mirk/ensemble.jl
deleted file mode 100644
index 5d9151b0..00000000
--- a/test/mirk/ensemble.jl
+++ /dev/null
@@ -1,29 +0,0 @@
-using BoundaryValueDiffEq, Random, Test
-
-function ode!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = -p[1] * u[1]
-end
-
-function bc!(residual, u, p, t)
-    residual[1] = u[1][1] - 1.0
-    residual[2] = u[end][1]
-end
-
-prob_func(prob, i, repeat) = remake(prob, p = [rand()])
-
-u0 = [0.0, 1.0]
-tspan = (0, pi / 2)
-p = [rand()]
-bvp = BVProblem(ode!, bc!, u0, tspan, p)
-ensemble_prob = EnsembleProblem(bvp; prob_func)
-
-@testset "$(solver)" for solver in (MIRK2, MIRK3, MIRK4, MIRK5, MIRK6)
-    jac_algs = [BVPJacobianAlgorithm(),
-        BVPJacobianAlgorithm(; bc_diffmode = AutoFiniteDiff(),
-            nonbc_diffmode = AutoSparseFiniteDiff())]
-    for jac_alg in jac_algs
-        sol = solve(ensemble_prob, solver(; jac_alg); trajectories = 10, dt = 0.1)
-        @test sol.converged
-    end
-end
diff --git a/test/mirk/ensemble_tests.jl b/test/mirk/ensemble_tests.jl
new file mode 100644
index 00000000..5c4bee54
--- /dev/null
+++ b/test/mirk/ensemble_tests.jl
@@ -0,0 +1,31 @@
+@testitem "EnsembleProblem" begin
+    using Random
+
+    function ode!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -p[1] * u[1]
+    end
+
+    function bc!(residual, u, p, t)
+        residual[1] = u[1][1] - 1.0
+        residual[2] = u[end][1]
+    end
+
+    prob_func(prob, i, repeat) = remake(prob, p = [rand()])
+
+    u0 = [0.0, 1.0]
+    tspan = (0, pi / 2)
+    p = [rand()]
+    bvp = BVProblem(ode!, bc!, u0, tspan, p)
+    ensemble_prob = EnsembleProblem(bvp; prob_func)
+
+    @testset "$(solver)" for solver in (MIRK2, MIRK3, MIRK4, MIRK5, MIRK6)
+        jac_algs = [BVPJacobianAlgorithm(),
+            BVPJacobianAlgorithm(;
+                bc_diffmode = AutoFiniteDiff(), nonbc_diffmode = AutoSparseFiniteDiff())]
+        for jac_alg in jac_algs
+            sol = solve(ensemble_prob, solver(; jac_alg); trajectories = 10, dt = 0.1)
+            @test sol.converged
+        end
+    end
+end
diff --git a/test/mirk/interpolation_test.jl b/test/mirk/interpolation_test.jl
deleted file mode 100644
index be4d852d..00000000
--- a/test/mirk/interpolation_test.jl
+++ /dev/null
@@ -1,34 +0,0 @@
-using BoundaryValueDiffEq, DiffEqBase, DiffEqDevTools, LinearAlgebra, Test
-
-λ = 1
-function prob_bvp_linear_analytic(u, λ, t)
-    a = 1 / sqrt(λ)
-    [(exp(-a * t) - exp((t - 2) * a)) / (1 - exp(-2 * a)),
-        (-a * exp(-t * a) - a * exp((t - 2) * a)) / (1 - exp(-2 * a))]
-end
-function prob_bvp_linear_f!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = 1 / p * u[1]
-end
-function prob_bvp_linear_bc!(res, u, p, t)
-    res[1] = u[1][1] - 1
-    res[2] = u[end][1]
-end
-prob_bvp_linear_function = ODEFunction(prob_bvp_linear_f!,
-    analytic = prob_bvp_linear_analytic)
-prob_bvp_linear_tspan = (0.0, 1.0)
-prob_bvp_linear = BVProblem(prob_bvp_linear_function, prob_bvp_linear_bc!,
-    [1.0, 0.0], prob_bvp_linear_tspan, λ)
-testTol = 1e-6
-
-for order in (2, 3, 4, 5, 6)
-    s = Symbol("MIRK$(order)")
-    @eval mirk_solver(::Val{$order}) = $(s)()
-end
-
-@testset "Interpolation" begin
-    @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
-        @time sol = solve(prob_bvp_linear, mirk_solver(Val(order)); dt = 0.001)
-        @test sol(0.001)≈[0.998687464, -1.312035941] atol=testTol
-    end
-end
diff --git a/test/mirk/mirk_basic_tests.jl b/test/mirk/mirk_basic_tests.jl
new file mode 100644
index 00000000..c80e380b
--- /dev/null
+++ b/test/mirk/mirk_basic_tests.jl
@@ -0,0 +1,252 @@
+@testsetup module MIRKConvergenceTests
+
+using BoundaryValueDiffEq
+
+for order in (2, 3, 4, 5, 6)
+    s = Symbol("MIRK$(order)")
+    @eval mirk_solver(::Val{$order}, args...; kwargs...) = $(s)(args...; kwargs...)
+end
+
+# First order test
+function f1!(du, u, p, t)
+    du[1] = u[2]
+    du[2] = 0
+end
+f1(u, p, t) = [u[2], 0]
+
+# Second order linear test
+function f2!(du, u, p, t)
+    du[1] = u[2]
+    du[2] = -u[1]
+end
+f2(u, p, t) = [u[2], -u[1]]
+
+function boundary!(residual, u, p, t)
+    residual[1] = u[1][1] - 5
+    residual[2] = u[end][1]
+end
+boundary(u, p, t) = [u[1][1] - 5, u[end][1]]
+
+function boundary_two_point_a!(resida, ua, p)
+    resida[1] = ua[1] - 5
+end
+function boundary_two_point_b!(residb, ub, p)
+    residb[1] = ub[1]
+end
+
+boundary_two_point_a(ua, p) = [ua[1] - 5]
+boundary_two_point_b(ub, p) = [ub[1]]
+
+# Not able to change the initial condition.
+# Hard coded solution.
+odef1! = ODEFunction(f1!, analytic = (u0, p, t) -> [5 - t, -1])
+odef1 = ODEFunction(f1, analytic = (u0, p, t) -> [5 - t, -1])
+
+odef2! = ODEFunction(f2!,
+    analytic = (u0, p, t) -> [
+        5 * (cos(t) - cot(5) * sin(t)), 5 * (-cos(t) * cot(5) - sin(t))])
+odef2 = ODEFunction(f2,
+    analytic = (u0, p, t) -> [
+        5 * (cos(t) - cot(5) * sin(t)), 5 * (-cos(t) * cot(5) - sin(t))])
+
+bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
+
+tspan = (0.0, 5.0)
+u0 = [5.0, -3.5]
+
+probArr = [BVProblem(odef1!, boundary!, u0, tspan, nlls = Val(false)),
+    BVProblem(odef1, boundary, u0, tspan, nlls = Val(false)),
+    BVProblem(odef2!, boundary!, u0, tspan, nlls = Val(false)),
+    BVProblem(odef2, boundary, u0, tspan, nlls = Val(false)),
+    TwoPointBVProblem(odef1!, (boundary_two_point_a!, boundary_two_point_b!),
+        u0, tspan; bcresid_prototype, nlls = Val(false)),
+    TwoPointBVProblem(odef1, (boundary_two_point_a, boundary_two_point_b),
+        u0, tspan; bcresid_prototype, nlls = Val(false)),
+    TwoPointBVProblem(odef2!, (boundary_two_point_a!, boundary_two_point_b!),
+        u0, tspan; bcresid_prototype, nlls = Val(false)),
+    TwoPointBVProblem(odef2, (boundary_two_point_a, boundary_two_point_b),
+        u0, tspan; bcresid_prototype, nlls = Val(false))]
+
+testTol = 0.2
+affineTol = 1e-2
+dts = 1 .// 2 .^ (3:-1:1)
+
+export probArr, testTol, affineTol, dts, mirk_solver
+
+end
+
+@testitem "Affineness" setup=[MIRKConvergenceTests] begin
+    using LinearAlgebra
+
+    @testset "Problem: $i" for i in (1, 2, 5, 6)
+        prob = probArr[i]
+        @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
+            sol = solve(prob, mirk_solver(Val(order)); dt = 0.2)
+            @test norm(diff(first.(sol.u)) .+ 0.2, Inf) + abs(sol[1][1] - 5) < affineTol
+        end
+    end
+end
+
+@testitem "JET: Runtime Dispatches" setup=[MIRKConvergenceTests] begin
+    using JET
+
+    @testset "Problem: $i" for i in 1:8
+        prob = probArr[i]
+        @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
+            solver = mirk_solver(Val(order); nlsolve = NewtonRaphson(),
+                jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2)))
+            @test_opt target_modules=(NonlinearSolve, BoundaryValueDiffEq) solve(
+                prob, solver; dt = 0.2)
+            @test_call target_modules=(NonlinearSolve, BoundaryValueDiffEq) solve(
+                prob, solver; dt = 0.2)
+        end
+    end
+end
+
+@testitem "Convergence on Linear" setup=[MIRKConvergenceTests] begin
+    using LinearAlgebra, DiffEqDevTools
+
+    @testset "Problem: $i" for i in (3, 4, 7, 8)
+        prob = probArr[i]
+        @testset "MIRK$order" for (i, order) in enumerate((2, 3, 4, 5, 6))
+            sim = test_convergence(
+                dts, prob, mirk_solver(Val(order)); abstol = 1e-8, reltol = 1e-8)
+            @test sim.𝒪est[:final]≈order atol=testTol
+        end
+    end
+end
+
+# FIXME: This is a really bad test. Needs interpolation
+@testitem "Simple Pendulum" begin
+    using StaticArrays
+
+    tspan = (0.0, π / 2)
+    function simplependulum!(du, u, p, t)
+        g, L, θ, dθ = 9.81, 1.0, u[1], u[2]
+        du[1] = dθ
+        du[2] = -(g / L) * sin(θ)
+    end
+
+    function bc_pendulum!(residual, u, p, t)
+        residual[1] = u[end ÷ 2][1] + π / 2 # the solution at the middle of the time span should be -pi/2
+        residual[2] = u[end][1] - π / 2 # the solution at the end of the time span should be pi/2
+    end
+
+    u0 = MVector{2}([pi / 2, pi / 2])
+    bvp1 = BVProblem(simplependulum!, bc_pendulum!, u0, tspan)
+
+    jac_alg = BVPJacobianAlgorithm(;
+        bc_diffmode = AutoFiniteDiff(), nonbc_diffmode = AutoSparseFiniteDiff())
+
+    # Using ForwardDiff might lead to Cache expansion warnings
+    @test_nowarn solve(bvp1, MIRK2(; jac_alg); dt = 0.005)
+    @test_nowarn solve(bvp1, MIRK3(; jac_alg); dt = 0.005)
+    @test_nowarn solve(bvp1, MIRK4(; jac_alg); dt = 0.05)
+    @test_nowarn solve(bvp1, MIRK5(; jac_alg); dt = 0.05)
+    @test_nowarn solve(bvp1, MIRK6(; jac_alg); dt = 0.05)
+end
+
+@testitem "Interpolation" begin
+    using LinearAlgebra
+
+    λ = 1
+    function prob_bvp_linear_analytic(u, λ, t)
+        a = 1 / sqrt(λ)
+        return [(exp(-a * t) - exp((t - 2) * a)) / (1 - exp(-2 * a)),
+            (-a * exp(-t * a) - a * exp((t - 2) * a)) / (1 - exp(-2 * a))]
+    end
+
+    function prob_bvp_linear_f!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = 1 / p * u[1]
+    end
+    function prob_bvp_linear_bc!(res, u, p, t)
+        res[1] = u[1][1] - 1
+        res[2] = u[end][1]
+    end
+
+    prob_bvp_linear_function = ODEFunction(
+        prob_bvp_linear_f!, analytic = prob_bvp_linear_analytic)
+    prob_bvp_linear_tspan = (0.0, 1.0)
+    prob_bvp_linear = BVProblem(
+        prob_bvp_linear_function, prob_bvp_linear_bc!, [1.0, 0.0], prob_bvp_linear_tspan, λ)
+    testTol = 1e-6
+
+    for order in (2, 3, 4, 5, 6)
+        s = Symbol("MIRK$(order)")
+        @eval mirk_solver(::Val{$order}) = $(s)()
+    end
+
+    @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
+        sol = solve(prob_bvp_linear, mirk_solver(Val(order)); dt = 0.001)
+        @test sol(0.001)≈[0.998687464, -1.312035941] atol=testTol
+    end
+end
+
+@testitem "Swirling Flow III" begin
+    # Reported in https://github.com/SciML/BoundaryValueDiffEq.jl/issues/153
+    eps = 0.01
+    function swirling_flow!(du, u, p, t)
+        eps = p
+        du[1] = u[2]
+        du[2] = (u[1] * u[4] - u[3] * u[2]) / eps
+        du[3] = u[4]
+        du[4] = u[5]
+        du[5] = u[6]
+        du[6] = (-u[3] * u[6] - u[1] * u[2]) / eps
+        return
+    end
+
+    function swirling_flow_bc!(res, u, p, t)
+        res[1] = u[1][1] + 1.0
+        res[2] = u[1][3]
+        res[3] = u[1][4]
+        res[4] = u[end][1] - 1.0
+        res[5] = u[end][3]
+        res[6] = u[end][4]
+        return
+    end
+
+    tspan = (0.0, 1.0)
+    u0 = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+    prob = BVProblem(swirling_flow!, swirling_flow_bc!, u0, tspan, eps)
+
+    @test_nowarn solve(prob, MIRK4(); dt = 0.01)
+end
+
+@testitem "Solve using Continuation" begin
+    using RecursiveArrayTools
+
+    g = 9.81
+    L = 1.0
+    tspan = (0.0, pi / 2)
+    function simplependulum!(du, u, p, t)
+        θ = u[1]
+        dθ = u[2]
+        du[1] = dθ
+        du[2] = -(g / L) * sin(θ)
+    end
+
+    function bc2a!(resid_a, u_a, p) # u_a is at the beginning of the time span
+        x0 = p
+        resid_a[1] = u_a[1] - x0 # the solution at the beginning of the time span should be -pi/2
+    end
+    function bc2b!(resid_b, u_b, p) # u_b is at the ending of the time span
+        x0 = p
+        resid_b[1] = u_b[1] - pi / 2 # the solution at the end of the time span should be pi/2
+    end
+
+    bvp3 = TwoPointBVProblem(
+        simplependulum!, (bc2a!, bc2b!), [pi / 2, pi / 2], (pi / 4, pi / 2),
+        -pi / 2; bcresid_prototype = (zeros(1), zeros(1)))
+    sol3 = solve(bvp3, MIRK4(), dt = 0.05)
+
+    # Needs a SciMLBase fix
+    bvp4 = TwoPointBVProblem(simplependulum!, (bc2a!, bc2b!), sol3, (0, pi / 2),
+        pi / 2; bcresid_prototype = (zeros(1), zeros(1)))
+    @test_broken solve(bvp4, MIRK4(), dt = 0.05) isa SciMLBase.ODESolution
+
+    bvp5 = TwoPointBVProblem(simplependulum!, (bc2a!, bc2b!), DiffEqArray(sol3.u, sol3.t),
+        (0, pi / 2), pi / 2; bcresid_prototype = (zeros(1), zeros(1)))
+    @test SciMLBase.successful_retcode(solve(bvp5, MIRK4(), dt = 0.05).retcode)
+end
diff --git a/test/mirk/mirk_convergence_tests.jl b/test/mirk/mirk_convergence_tests.jl
deleted file mode 100644
index c93b11b6..00000000
--- a/test/mirk/mirk_convergence_tests.jl
+++ /dev/null
@@ -1,126 +0,0 @@
-using BoundaryValueDiffEq, DiffEqBase, DiffEqDevTools, LinearAlgebra, Test
-
-for order in (2, 3, 4, 5, 6)
-    s = Symbol("MIRK$(order)")
-    @eval mirk_solver(::Val{$order}) = $(s)()
-end
-
-# First order test
-function f1!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = 0
-end
-f1(u, p, t) = [u[2], 0]
-
-# Second order linear test
-function f2!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = -u[1]
-end
-f2(u, p, t) = [u[2], -u[1]]
-
-function boundary!(residual, u, p, t)
-    residual[1] = u[1][1] - 5
-    residual[2] = u[end][1]
-end
-boundary(u, p, t) = [u[1][1] - 5, u[end][1]]
-
-function boundary_two_point_a!(resida, ua, p)
-    resida[1] = ua[1] - 5
-end
-function boundary_two_point_b!(residb, ub, p)
-    residb[1] = ub[1]
-end
-
-boundary_two_point_a(ua, p) = [ua[1] - 5]
-boundary_two_point_b(ub, p) = [ub[1]]
-
-# Not able to change the initial condition.
-# Hard coded solution.
-odef1! = ODEFunction(f1!, analytic = (u0, p, t) -> [5 - t, -1])
-odef1 = ODEFunction(f1, analytic = (u0, p, t) -> [5 - t, -1])
-
-odef2! = ODEFunction(f2!,
-    analytic = (u0, p, t) -> [
-        5 * (cos(t) - cot(5) * sin(t)),
-        5 * (-cos(t) * cot(5) - sin(t))
-    ])
-odef2 = ODEFunction(f2,
-    analytic = (u0, p, t) -> [
-        5 * (cos(t) - cot(5) * sin(t)),
-        5 * (-cos(t) * cot(5) - sin(t))
-    ])
-
-bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1))
-
-tspan = (0.0, 5.0)
-u0 = [5.0, -3.5]
-
-probArr = [
-    BVProblem(odef1!, boundary!, u0, tspan),
-    BVProblem(odef1, boundary, u0, tspan),
-    BVProblem(odef2!, boundary!, u0, tspan),
-    BVProblem(odef2, boundary, u0, tspan),
-    TwoPointBVProblem(odef1!, (boundary_two_point_a!, boundary_two_point_b!), u0, tspan;
-        bcresid_prototype),
-    TwoPointBVProblem(odef1, (boundary_two_point_a, boundary_two_point_b), u0, tspan;
-        bcresid_prototype),
-    TwoPointBVProblem(odef2!, (boundary_two_point_a!, boundary_two_point_b!), u0, tspan;
-        bcresid_prototype),
-    TwoPointBVProblem(odef2, (boundary_two_point_a, boundary_two_point_b), u0, tspan;
-        bcresid_prototype)
-];
-
-testTol = 0.2
-affineTol = 1e-2
-dts = 1 .// 2 .^ (3:-1:1)
-
-@testset "Affineness" begin
-    @testset "Problem: $i" for i in (1, 2, 5, 6)
-        prob = probArr[i]
-        @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
-            @time sol = solve(prob, mirk_solver(Val(order)); dt = 0.2)
-            @test norm(diff(first.(sol.u)) .+ 0.2, Inf) + abs(sol[1][1] - 5) < affineTol
-        end
-    end
-end
-
-@testset "Convergence on Linear" begin
-    @testset "Problem: $i" for i in (3, 4, 7, 8)
-        prob = probArr[i]
-        @testset "MIRK$order" for (i, order) in enumerate((2, 3, 4, 5, 6))
-            @time sim = test_convergence(dts, prob, mirk_solver(Val(order));
-                abstol = 1e-8, reltol = 1e-8)
-            @test sim.𝒪est[:final]≈order atol=testTol
-        end
-    end
-end
-
-# Simple Pendulum
-using StaticArrays
-
-tspan = (0.0, π / 2)
-function simplependulum!(du, u, p, t)
-    g, L, θ, dθ = 9.81, 1.0, u[1], u[2]
-    du[1] = dθ
-    du[2] = -(g / L) * sin(θ)
-end
-
-# FIXME: This is a really bad test. Needs interpolation
-function bc_pendulum!(residual, u, p, t)
-    residual[1] = u[end ÷ 2][1] + π / 2 # the solution at the middle of the time span should be -pi/2
-    residual[2] = u[end][1] - π / 2 # the solution at the end of the time span should be pi/2
-end
-
-u0 = MVector{2}([pi / 2, pi / 2])
-bvp1 = BVProblem(simplependulum!, bc_pendulum!, u0, tspan)
-
-jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoFiniteDiff(),
-    nonbc_diffmode = AutoSparseFiniteDiff())
-
-# Using ForwardDiff might lead to Cache expansion warnings
-@test_nowarn solve(bvp1, MIRK2(; jac_alg); dt = 0.005)
-@test_nowarn solve(bvp1, MIRK3(; jac_alg); dt = 0.005)
-@test_nowarn solve(bvp1, MIRK4(; jac_alg); dt = 0.05)
-@test_nowarn solve(bvp1, MIRK5(; jac_alg); dt = 0.05)
-@test_nowarn solve(bvp1, MIRK6(; jac_alg); dt = 0.05)
diff --git a/test/mirk/nonlinear_least_squares.jl b/test/mirk/nlls_tests.jl
similarity index 80%
rename from test/mirk/nonlinear_least_squares.jl
rename to test/mirk/nlls_tests.jl
index d3a69aa5..37c5e6c3 100644
--- a/test/mirk/nonlinear_least_squares.jl
+++ b/test/mirk/nlls_tests.jl
@@ -1,9 +1,9 @@
-using BoundaryValueDiffEq, LinearAlgebra, Test
+@testitem "Overconstrained BVP" begin
+    using LinearAlgebra
 
-@testset "Overconstrained BVP" begin
     SOLVERS = [mirk(; nlsolve)
                for mirk in (MIRK4, MIRK5, MIRK6),
-    nlsolve in (LevenbergMarquardt(), GaussNewton(), nothing)]
+    nlsolve in (LevenbergMarquardt(), GaussNewton(), TrustRegion(), nothing)]
 
     # OOP MP-BVP
     f1(u, p, t) = [u[2], -u[1]]
@@ -20,7 +20,7 @@ using BoundaryValueDiffEq, LinearAlgebra, Test
     bvp1 = BVProblem(BVPFunction{false}(f1, bc1; bcresid_prototype = zeros(3)), u0, tspan)
 
     for solver in SOLVERS
-        @time sol = solve(bvp1, solver; verbose = false, dt = 1.0)
+        sol = solve(bvp1, solver; verbose = false, dt = 1.0)
         @test norm(bc1(sol, nothing, tspan), Inf) < 1e-2
     end
 
@@ -44,7 +44,7 @@ using BoundaryValueDiffEq, LinearAlgebra, Test
     bvp2 = BVProblem(BVPFunction{true}(f1!, bc1!; bcresid_prototype = zeros(3)), u0, tspan)
 
     for solver in SOLVERS
-        @time sol = solve(bvp2, solver; verbose = false, dt = 1.0)
+        sol = solve(bvp2, solver; verbose = false, dt = 1.0)
         resid_f = Array{Float64}(undef, 3)
         bc1!(resid_f, sol, nothing, sol.t)
         @test norm(resid_f, Inf) < 1e-2
@@ -61,7 +61,7 @@ using BoundaryValueDiffEq, LinearAlgebra, Test
         tspan)
 
     for solver in SOLVERS
-        @time sol = solve(bvp3, solver; verbose = false, dt = 1.0)
+        sol = solve(bvp3, solver; verbose = false, dt = 1.0)
         @test norm(vcat(bc1a(sol[1], nothing), bc1b(sol[end], nothing)), Inf) < 1e-2
     end
 
@@ -76,8 +76,7 @@ using BoundaryValueDiffEq, LinearAlgebra, Test
         tspan)
 
     for solver in SOLVERS
-        @time sol = solve(bvp3, solver; verbose = false, dt = 1.0, abstol = 1e-3,
-            reltol = 1e-3)
+        sol = solve(bvp3, solver; verbose = false, dt = 1.0, abstol = 1e-3, reltol = 1e-3)
         resida = Array{Float64}(undef, 1)
         residb = Array{Float64}(undef, 2)
         bc1a!(resida, sol(0.0), nothing)
@@ -88,17 +87,18 @@ end
 
 # This is not a very meaningful problem, but it tests that our solvers are not throwing an
 # error
-# These tests are taking far too long currently and failing
-#=
-@testset "Underconstrained BVP: Rod BVP" begin
+@testitem "Underconstrained BVP: Rod BVP" begin
+    using LinearAlgebra
+
     # Force normal form for GN
-    SOLVERS = [mirk(; nlsolve) for mirk in (MIRK4, MIRK5, MIRK6),
-    nlsolve in (TrustRegion(), GaussNewton(), nothing)]
+    SOLVERS = [mirk(; nlsolve)
+               for mirk in (MIRK4, MIRK5, MIRK6),
+    nlsolve in (TrustRegion(), GaussNewton(), LevenbergMarquardt(), nothing)]
 
     function hat(y)
         return [0 -y[3] y[2]
-            y[3] 0 -y[1]
-            -y[2] y[1] 0]
+                y[3] 0 -y[1]
+                -y[2] y[1] 0]
     end
 
     function inv_hat(skew)
@@ -186,18 +186,16 @@ end
     rod_ode!(dy, y, p, t) = rod_ode!(dy, y, p, t, inv(Kse), inv(Kbt), rho, A, g)
     y0 = vcat(p0, R0, zeros(6))
     p = vcat(p0, R0, pL, RL)
-    prob_tp = TwoPointBVProblem(rod_ode!, (bc_a!, bc_b!), y0, tspan, p,
-        bcresid_prototype = (zeros(6), zeros(6)))
-    prob = BVProblem(BVPFunction(rod_ode!, bc!; bcresid_prototype = zeros(12)), y0, tspan,
-        p)
+    prob_tp = TwoPointBVProblem(
+        rod_ode!, (bc_a!, bc_b!), y0, tspan, p, bcresid_prototype = (zeros(6), zeros(6)))
+    prob = BVProblem(
+        BVPFunction(rod_ode!, bc!; bcresid_prototype = zeros(12)), y0, tspan, p)
 
     for solver in SOLVERS
-        @time sol = solve(prob_tp, solver; verbose = false, dt = 0.1, abstol = 1e-3,
-            reltol = 1e-3)
+        sol = solve(
+            prob_tp, solver; verbose = false, dt = 0.1, abstol = 1e-1, reltol = 1e-1)
         @test SciMLBase.successful_retcode(sol.retcode)
-        @time sol = solve(prob, solver; verbose = false, dt = 0.1, abstol = 1e-3,
-            reltol = 1e-3)
+        sol = solve(prob, solver; verbose = false, dt = 0.1, abstol = 1e-1, reltol = 1e-1)
         @test SciMLBase.successful_retcode(sol.retcode)
     end
 end
-=#
diff --git a/test/mirk/vectorofvector_initials.jl b/test/mirk/vectorofvector_initials.jl
deleted file mode 100644
index 79df5db7..00000000
--- a/test/mirk/vectorofvector_initials.jl
+++ /dev/null
@@ -1,70 +0,0 @@
-using BoundaryValueDiffEq, Test, OrdinaryDiffEq
-
-#System Constants
-ss = 1 #excitatory parameter
-sj = 0 #inhibitory parameter
-glb = 0.05
-el = -70
-gnab = 3
-ena = 50
-gkb = 5
-ek = -90
-gtb = 2
-et = 90
-gex = 0.1
-vex = 0
-gsyn = 0.13
-vsyn = -85
-iext = 0.41
-eps = 1
-qht = 2.5
-
-#System functions
-function f(v, h, r)
-    -(glb * (v - el) + gnab * (1 / (1 + exp(-(v + 37) / 7)))^3 * h * (v - ena) +
-      gkb * (0.75 * (1 - h))^4 * (v - ek) +
-      gtb * (1 / (1 + exp(-(v + 60) / 6.2)))^2 * r * (v - et)) - gex * ss * (v - vex) -
-    gsyn * sj * (v - vsyn) + iext
-end
-
-function g(v, h)
-    eps * ((1 / (1 + exp((v + 41) / 4))) - h) /
-    (1 / ((0.128 * exp(-(v + 46) / 18)) + (4 / (1 + exp(-(v + 23) / 5)))))
-end
-
-function k(v, r)
-    qht * ((1 / (1 + exp((v + 84) / 4))) - r) / ((28 + exp(-(v + 25) / 10.5)))
-end
-
-#Dynamical System
-function TC!(du, u, p, t)
-    v, h, r = u
-
-    du[1] = dv = f(v, h, r)
-    du[2] = dh = g(v, h)
-    du[3] = dr = k(v, r)
-end
-
-#Finding initial guesses by forward integration
-T = 7.588145762136627 #orbit length
-u0 = [-40.296570996984855, 0.7298857398191566, 0.0011680534089275774]
-tspan = (0.0, T)
-prob = ODEProblem(TC!, u0, tspan, dt = 0.01)
-sol = solve(prob, Rodas4P(), reltol = 1e-12, abstol = 1e-12, saveat = 0.5)
-
-# The BVP set up
-# This is not really kind of Two-Point BVP we support.
-function bc_po!(residual, u, p, t)
-    residual[1] = u[1][1] - u[end][1]
-    residual[2] = u[1][2] - u[end][2]
-    residual[3] = u[1][3] - u[end][3]
-end
-
-#This is the part of the code that has problems
-bvp1 = BVProblem(TC!, bc_po!, sol.u, tspan)
-sol6 = solve(bvp1, MIRK6(); dt = 0.5)
-@test SciMLBase.successful_retcode(sol6.retcode)
-
-bvp1 = BVProblem(TC!, bc_po!, zero(first(sol.u)), tspan)
-sol6 = solve(bvp1, MIRK6(); dt = 0.1, abstol = 1e-15)
-@test SciMLBase.successful_retcode(sol6.retcode)
diff --git a/test/mirk/vectorofvector_initials_tests.jl b/test/mirk/vectorofvector_initials_tests.jl
new file mode 100644
index 00000000..7a2be9a7
--- /dev/null
+++ b/test/mirk/vectorofvector_initials_tests.jl
@@ -0,0 +1,71 @@
+@testitem "VectorOfVector Initial Condition" begin
+    #System Constants
+    ss = 1 #excitatory parameter
+    sj = 0 #inhibitory parameter
+    glb = 0.05
+    el = -70
+    gnab = 3
+    ena = 50
+    gkb = 5
+    ek = -90
+    gtb = 2
+    et = 90
+    gex = 0.1
+    vex = 0
+    gsyn = 0.13
+    vsyn = -85
+    iext = 0.41
+    eps = 1
+    qht = 2.5
+
+    #System functions
+    function f(v, h, r)
+        -(glb * (v - el) +
+          gnab * (1 / (1 + exp(-(v + 37) / 7)))^3 * h * (v - ena) +
+          gkb * (0.75 * (1 - h))^4 * (v - ek) +
+          gtb * (1 / (1 + exp(-(v + 60) / 6.2)))^2 * r * (v - et)) - gex * ss * (v - vex) -
+        gsyn * sj * (v - vsyn) + iext
+    end
+
+    function g(v, h)
+        eps * ((1 / (1 + exp((v + 41) / 4))) - h) /
+        (1 / ((0.128 * exp(-(v + 46) / 18)) + (4 / (1 + exp(-(v + 23) / 5)))))
+    end
+
+    function k(v, r)
+        qht * ((1 / (1 + exp((v + 84) / 4))) - r) / ((28 + exp(-(v + 25) / 10.5)))
+    end
+
+    #Dynamical System
+    function TC!(du, u, p, t)
+        v, h, r = u
+
+        du[1] = dv = f(v, h, r)
+        du[2] = dh = g(v, h)
+        du[3] = dr = k(v, r)
+    end
+
+    #Finding initial guesses by forward integration
+    T = 7.588145762136627 #orbit length
+    u0 = [-40.296570996984855, 0.7298857398191566, 0.0011680534089275774]
+    tspan = (0.0, T)
+    prob = ODEProblem(TC!, u0, tspan, dt = 0.01)
+    sol = solve(prob, Rodas4P(), reltol = 1e-12, abstol = 1e-12, saveat = 0.5)
+
+    # The BVP set up
+    # This is not really kind of Two-Point BVP we support.
+    function bc_po!(residual, u, p, t)
+        residual[1] = u[1][1] - u[end][1]
+        residual[2] = u[1][2] - u[end][2]
+        residual[3] = u[1][3] - u[end][3]
+    end
+
+    #This is the part of the code that has problems
+    bvp1 = BVProblem(TC!, bc_po!, sol.u, tspan)
+    sol6 = solve(bvp1, MIRK6(); dt = 0.5)
+    @test SciMLBase.successful_retcode(sol6.retcode)
+
+    bvp1 = BVProblem(TC!, bc_po!, zero(first(sol.u)), tspan)
+    sol6 = solve(bvp1, MIRK6(); dt = 0.1, abstol = 1e-15)
+    @test SciMLBase.successful_retcode(sol6.retcode)
+end
diff --git a/test/misc/affine_geodesic.jl b/test/misc/affine_geodesic.jl
deleted file mode 100644
index 69fcc938..00000000
--- a/test/misc/affine_geodesic.jl
+++ /dev/null
@@ -1,51 +0,0 @@
-using BoundaryValueDiffEq
-
-struct EmbeddedTorus
-    R::Float64
-    r::Float64
-end
-
-function affine_connection!(M::EmbeddedTorus, Zc, i, a, Xc, Yc)
-    θ = a[1] .+ i[1]
-    sinθ, cosθ = sincos(θ)
-    Γ¹₂₂ = (M.R + M.r * cosθ) * sinθ / M.r
-    Γ²₁₂ = -M.r * sinθ / (M.R + M.r * cosθ)
-
-    Zc[1] = Xc[2] * Γ¹₂₂ * Yc[2]
-    Zc[2] = Γ²₁₂ * (Xc[1] * Yc[2] + Xc[2] * Yc[1])
-    return Zc
-end
-
-M = EmbeddedTorus(3, 2)
-a1 = [0.5, -1.2]
-a2 = [-0.5, 0.3]
-i = (0, 0)
-solver = MIRK4()
-dt = 0.05
-tspan = (0.0, 1.0)
-
-function bc1!(residual, u, p, t)
-    mid = div(length(u[1]), 2)
-    residual[1:mid] = u[1][1:mid] - a1
-    return residual[(mid + 1):end] = u[end][1:mid] - a2
-end
-
-function chart_log_problem!(du, u, params, t)
-    M, i = params
-    mid = div(length(u), 2)
-    a = u[1:mid]
-    dx = u[(mid + 1):end]
-    ddx = similar(dx)
-    affine_connection!(M, ddx, i, a, dx, dx)
-    ddx .*= -1
-    du[1:mid] .= dx
-    du[(mid + 1):end] .= ddx
-    return du
-end
-
-@testset "successful convergence" begin
-    u0 = [vcat(a1, zero(a1)), vcat(a2, zero(a1))]
-    bvp1 = BVProblem(chart_log_problem!, bc1!, u0, tspan, (M, i))
-    sol1 = solve(bvp1, solver, dt = dt)
-    @test SciMLBase.successful_retcode(sol1.retcode)
-end
diff --git a/test/misc/aqua.jl b/test/misc/aqua.jl
deleted file mode 100644
index 45a931bd..00000000
--- a/test/misc/aqua.jl
+++ /dev/null
@@ -1,6 +0,0 @@
-using Aqua, BoundaryValueDiffEq, Test
-
-@testset "All Tests (except Ambiguities)" begin
-    # Ambiguities are from downstream pacakges
-    Aqua.test_all(BoundaryValueDiffEq; ambiguities = false)
-end
diff --git a/test/misc/initial_guess.jl b/test/misc/initial_guess.jl
deleted file mode 100644
index c20284c0..00000000
--- a/test/misc/initial_guess.jl
+++ /dev/null
@@ -1,82 +0,0 @@
-using BoundaryValueDiffEq, OrdinaryDiffEq, Test, LinearAlgebra
-
-@testset "Initial Guess" begin
-    # Problem taken from https://github.com/SciML/BoundaryValueDiffEq.jl/issues/117#issuecomment-1780981510
-    function affine_connection(a, Xc, Yc)
-        MR = 3.0
-        Mr = 2.0
-        Zc = similar(Xc)
-        θ = a[1]
-        sinθ, cosθ = sincos(θ)
-        Γ¹₂₂ = (MR + Mr * cosθ) * sinθ / Mr
-        Γ²₁₂ = -Mr * sinθ / (MR + Mr * cosθ)
-
-        Zc[1] = Xc[2] * Γ¹₂₂ * Yc[2]
-        Zc[2] = Γ²₁₂ * (Xc[1] * Yc[2] + Xc[2] * Yc[1])
-        return Zc
-    end
-
-    function chart_log_problem!(du, u, p, t)
-        mid = div(length(u), 2)
-        a = u[1:mid]
-        dx = u[(mid + 1):end]
-        ddx = -affine_connection(a, dx, dx)
-        du[1:mid] .= dx
-        du[(mid + 1):end] .= ddx
-        return du
-    end
-
-    function bc1!(residual, u, p, t)
-        a1, a2 = p[1:2], p[3:4]
-        mid = div(length(u[1]), 2)
-        residual[1:mid] = u[1][1:mid] - a1
-        residual[(mid + 1):end] = u[end][1:mid] - a2
-        return residual
-    end
-
-    function initial_guess_1(p, t)
-        a1, a2 = p[1:2], p[3:4]
-        return vcat(t * a1 + (1 - t) * a2, zero(a1))
-    end
-
-    function initial_guess_2(t)
-        a1, a2 = [0.5, -1.2], [-0.5, 0.3]
-        return vcat(t * a1 + (1 - t) * a2, zero(a1))
-    end
-
-    dt = 0.05
-    p = [0.5, -1.2, -0.5, 0.3]
-    tspan = (0.0, 1.0)
-
-    bvp1 = BVProblem(chart_log_problem!, bc1!, initial_guess_1, tspan, p)
-
-    algs = [Shooting(Tsit5()), MultipleShooting(10, Tsit5()), MIRK4(), MIRK5(), MIRK6()]
-
-    for alg in algs
-        if alg isa Shooting || alg isa MultipleShooting
-            sol = solve(bvp1, alg)
-        else
-            sol = solve(bvp1, alg; dt)
-        end
-        @test SciMLBase.successful_retcode(sol)
-        resid = zeros(4)
-        bc1!(resid, sol, p, sol.t)
-        @test norm(resid, Inf) < 1e-10
-    end
-
-    bvp2 = BVProblem(chart_log_problem!, bc1!, initial_guess_2, tspan, p)
-
-    for alg in algs
-        if alg isa Shooting || alg isa MultipleShooting
-            sol = solve(bvp2, alg)
-            @test_deprecated solve(bvp2, alg)
-        else
-            sol = solve(bvp2, alg; dt)
-            @test_deprecated solve(bvp2, alg; dt)
-        end
-        @test SciMLBase.successful_retcode(sol)
-        resid = zeros(4)
-        bc1!(resid, sol, p, sol.t)
-        @test norm(resid, Inf) < 1e-10
-    end
-end
diff --git a/test/misc/manifolds_tests.jl b/test/misc/manifolds_tests.jl
new file mode 100644
index 00000000..fbc532c2
--- /dev/null
+++ b/test/misc/manifolds_tests.jl
@@ -0,0 +1,83 @@
+@testitem "Manifolds.jl Integration" begin
+    using LinearAlgebra
+
+    struct EmbeddedTorus
+        R::Float64
+        r::Float64
+    end
+
+    function affine_connection!(M::EmbeddedTorus, Zc, i, a, Xc, Yc)
+        θ = a[1] .+ i[1]
+        sinθ, cosθ = sincos(θ)
+        Γ¹₂₂ = (M.R + M.r * cosθ) * sinθ / M.r
+        Γ²₁₂ = -M.r * sinθ / (M.R + M.r * cosθ)
+
+        Zc[1] = Xc[2] * Γ¹₂₂ * Yc[2]
+        Zc[2] = Γ²₁₂ * (Xc[1] * Yc[2] + Xc[2] * Yc[1])
+        return Zc
+    end
+
+    M = EmbeddedTorus(3, 2)
+    a1 = [0.5, -1.2]
+    a2 = [-0.5, 0.3]
+    i = (0, 0)
+    solver = MIRK4()
+    dt = 0.05
+    tspan = (0.0, 1.0)
+
+    function bc1!(residual, u, params, t)
+        M, i, a1, a2 = params
+        mid = div(length(u[1]), 2)
+        residual[1:mid] = u[1][1:mid] - a1
+        residual[(mid + 1):end] = u[end][1:mid] - a2
+        return
+    end
+
+    function chart_log_problem!(du, u, params, t)
+        M, i, a1, a2 = params
+        mid = div(length(u), 2)
+        a = u[1:mid]
+        dx = u[(mid + 1):end]
+        ddx = similar(dx)
+        affine_connection!(M, ddx, i, a, dx, dx)
+        ddx .*= -1
+        du[1:mid] .= dx
+        du[(mid + 1):end] .= ddx
+        return du
+    end
+
+    @testset "Successful Convergence" begin
+        u0 = [vcat(a1, zero(a1)), vcat(a2, zero(a1))]
+        bvp1 = BVProblem(chart_log_problem!, bc1!, u0, tspan, (M, i, a1, a2))
+        sol1 = solve(bvp1, solver, dt = dt)
+        @test SciMLBase.successful_retcode(sol1.retcode)
+    end
+
+    function initial_guess_1(p, t)
+        _, _, a1, a2 = p
+        return vcat(t * a1 + (1 - t) * a2, zero(a1))
+    end
+
+    function initial_guess_2(t)
+        a1, a2 = [0.5, -1.2], [-0.5, 0.3]
+        return vcat(t * a1 + (1 - t) * a2, zero(a1))
+    end
+
+    algs = [Shooting(Tsit5()), MultipleShooting(10, Tsit5()), MIRK4(), MIRK5(), MIRK6()]
+
+    @testset "Initial Guess Functions: $(u0)" for u0 in (initial_guess_1, initial_guess_2)
+        bvp = BVProblem(chart_log_problem!, bc1!, u0, tspan, (M, i, a1, a2))
+
+        for alg in algs
+            if alg isa Shooting || alg isa MultipleShooting
+                sol = solve(bvp, alg)
+            else
+                sol = solve(bvp, alg; dt)
+            end
+            @test SciMLBase.successful_retcode(sol)
+            resid = zeros(4)
+            bc1!(resid, sol, (M, i, a1, a2), sol.t)
+            @test norm(resid, Inf) < 1e-10
+        end
+    end
+end
diff --git a/test/misc/non_vector_input_tests.jl b/test/misc/non_vector_input_tests.jl
new file mode 100644
index 00000000..a0ddcb06
--- /dev/null
+++ b/test/misc/non_vector_input_tests.jl
@@ -0,0 +1,71 @@
+@testitem "Non-Vector Inputs" begin
+    using LinearAlgebra
+
+    for order in (2, 3, 4, 5, 6)
+        s = Symbol("MIRK$(order)")
+        @eval mirk_solver(::Val{$order}) = $(s)()
+    end
+
+    function f1!(du, u, p, t)
+        du[1, 1] = u[1, 2]
+        du[1, 2] = 0
+    end
+
+    function f1(u, p, t)
+        return [u[1, 2] 0]
+    end
+
+    function boundary!(residual, u, p, t)
+        residual[1, 1] = u[1][1, 1] - 5
+        residual[1, 2] = u[end][1, 1]
+    end
+
+    function boundary_a!(resida, ua, p)
+        resida[1, 1] = ua[1, 1] - 5
+    end
+    function boundary_b!(residb, ub, p)
+        residb[1, 1] = ub[1, 1]
+    end
+
+    function boundary(u, p, t)
+        return [u[1][1, 1] - 5 u[end][1, 1]]
+    end
+
+    boundary_a = (ua, p) -> [ua[1, 1] - 5]
+    boundary_b = (ub, p) -> [ub[1, 1]]
+
+    tspan = (0.0, 5.0)
+    u0 = [5.0 -3.5]
+    probs = [BVProblem(f1!, boundary!, u0, tspan; nlls = Val(false)),
+        TwoPointBVProblem(f1!, (boundary_a!, boundary_b!), u0, tspan;
+            bcresid_prototype = (Array{Float64}(undef, 1, 1), Array{Float64}(undef, 1, 1)),
+            nlls = Val(false)),
+        BVProblem(f1, boundary, u0, tspan; nlls = Val(false)),
+        TwoPointBVProblem(f1, (boundary_a, boundary_b), u0, tspan; nlls = Val(false))]
+
+    @testset "Affineness" begin
+        @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
+            for prob in probs
+                sol = solve(prob, mirk_solver(Val(order)); dt = 0.2)
+                @test norm(boundary(sol, prob.p, nothing), Inf) < 0.01
+            end
+        end
+
+        @testset "Single Shooting" begin
+            for prob in probs
+                sol = solve(prob, Shooting(Tsit5()))
+                @test norm(boundary(sol, prob.p, nothing), Inf) < 0.01
+            end
+        end
+
+        # FIXME: Add Multiple Shooting here once it supports non-vector inputs
+        @testset "Multiple Shooting" begin
+            for prob in probs
+                @test_broken begin
+                    sol = solve(prob, MultipleShooting(5, Tsit5()))
+                    norm(boundary(sol, prob.p, nothing), Inf) < 0.01
+                end
+            end
+        end
+    end
+end
diff --git a/test/misc/non_vector_inputs.jl b/test/misc/non_vector_inputs.jl
deleted file mode 100644
index 101ea232..00000000
--- a/test/misc/non_vector_inputs.jl
+++ /dev/null
@@ -1,62 +0,0 @@
-using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
-
-for order in (2, 3, 4, 5, 6)
-    s = Symbol("MIRK$(order)")
-    @eval mirk_solver(::Val{$order}) = $(s)()
-end
-
-function f1!(du, u, p, t)
-    du[1, 1] = u[1, 2]
-    du[1, 2] = 0
-end
-
-function f1(u, p, t)
-    return [u[1, 2] 0]
-end
-
-function boundary!(residual, u, p, t)
-    residual[1, 1] = u[1][1, 1] - 5
-    residual[1, 2] = u[end][1, 1]
-end
-
-function boundary_a!(resida, ua, p)
-    resida[1, 1] = ua[1, 1] - 5
-end
-function boundary_b!(residb, ub, p)
-    residb[1, 1] = ub[1, 1]
-end
-
-function boundary(u, p, t)
-    return [u[1][1, 1] - 5 u[end][1, 1]]
-end
-
-boundary_a = (ua, p) -> [ua[1, 1] - 5]
-boundary_b = (ub, p) -> [ub[1, 1]]
-
-tspan = (0.0, 5.0)
-u0 = [5.0 -3.5]
-probs = [
-    BVProblem(f1!, boundary!, u0, tspan),
-    TwoPointBVProblem(f1!, (boundary_a!, boundary_b!), u0, tspan;
-        bcresid_prototype = (Array{Float64}(undef, 1, 1), Array{Float64}(undef, 1, 1))),
-    BVProblem(f1, boundary, u0, tspan),
-    TwoPointBVProblem(f1, (boundary_a, boundary_b), u0, tspan)
-];
-
-@testset "Affineness" begin
-    @testset "MIRK$order" for order in (2, 3, 4, 5, 6)
-        for prob in probs
-            @time sol = solve(prob, mirk_solver(Val(order)); dt = 0.2)
-            @test norm(diff(first.(sol.u)) .+ 0.2, Inf) + abs(sol[1][1] - 5) < 0.01
-        end
-    end
-
-    @testset "Single Shooting" begin
-        for prob in probs
-            @time sol = solve(prob, Shooting(Tsit5()))
-            @test norm(boundary(sol, prob.p, nothing), Inf) < 0.01
-        end
-    end
-
-    # FIXME: Add Multiple Shooting here once it supports non-vector inputs
-end
diff --git a/test/misc/odeinterface_wrapper.jl b/test/misc/odeinterface_wrapper.jl
deleted file mode 100644
index ead00b84..00000000
--- a/test/misc/odeinterface_wrapper.jl
+++ /dev/null
@@ -1,74 +0,0 @@
-using Test, BoundaryValueDiffEq, LinearAlgebra, ODEInterface, Random
-
-# Adaptation of https://github.com/luchr/ODEInterface.jl/blob/958b6023d1dabf775033d0b89c5401b33100bca3/examples/BasicExamples/ex7.jl
-function ex7_f!(du, u, p, t)
-    ϵ = p[1]
-    u₁, λ = u
-    du[1] = (sin(t)^2 + λ * sin(t)^4 / u₁) / ϵ
-    du[2] = 0
-    return nothing
-end
-
-function ex7_2pbc1!(resa, ua, p)
-    resa[1] = ua[1] - 1
-    return nothing
-end
-
-function ex7_2pbc2!(resb, ub, p)
-    resb[1] = ub[1] - 1
-    return nothing
-end
-
-u0 = [0.5, 1.0]
-p = [0.1]
-tspan = (-π / 2, π / 2)
-
-tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), u0, tspan, p;
-    bcresid_prototype = (zeros(1), zeros(1)))
-
-@info "BVPM2"
-
-sol_bvpm2 = solve(tpprob, BVPM2(); dt = π / 20)
-@test SciMLBase.successful_retcode(sol_bvpm2)
-resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
-ex7_2pbc1!(resid_f[1], sol_bvpm2(tspan[1]), nothing)
-ex7_2pbc2!(resid_f[2], sol_bvpm2(tspan[2]), nothing)
-@test norm(resid_f, Inf) < 1e-6
-
-function ex7_f2!(du, u, p, t)
-    u₁, λ = u
-    du[1] = (sin(t)^2 + λ * sin(t)^4 / u₁) / 0.1
-    du[2] = 0
-    return nothing
-end
-
-@info "BVPSOL"
-
-initial_u0 = [sol_bvpm2(t) .+ rand() for t in tspan[1]:(π / 20):tspan[2]]
-tpprob = TwoPointBVProblem(ex7_f2!, (ex7_2pbc1!, ex7_2pbc2!), initial_u0, tspan;
-    bcresid_prototype = (zeros(1), zeros(1)))
-
-# Just test that it runs. BVPSOL only works with linearly separable BCs.
-# TODO: Implement appolo reentry example from ODEInterface.jl
-sol_bvpsol = solve(tpprob, BVPSOL(); dt = π / 20)
-
-@info "COLNEW"
-
-function f!(du, u, p, t)
-    du[1] = u[2]
-    du[2] = u[1]
-end
-function bca!(resid_a, u_a, p)
-    resid_a[1] = u_a[1] - 1
-end
-function bcb!(resid_b, u_b, p)
-    resid_b[1] = u_b[1]
-end
-
-fun = BVPFunction(
-    f!, (bca!, bcb!), bcresid_prototype = (zeros(1), zeros(1)), twopoint = Val(true))
-tspan = (0.0, 1.0)
-
-prob = TwoPointBVProblem(fun, [1.0, 0.0], tspan)
-sol_colnew = solve(prob, COLNEW(), dt = 0.01)
-@test SciMLBase.successful_retcode(sol_colnew)
diff --git a/test/misc/qa_tests.jl b/test/misc/qa_tests.jl
new file mode 100644
index 00000000..e8ef3e30
--- /dev/null
+++ b/test/misc/qa_tests.jl
@@ -0,0 +1,6 @@
+@testitem "Quality Assurance" begin
+    using Aqua
+
+    Aqua.test_all(BoundaryValueDiffEq; ambiguities = false)
+    Aqua.test_ambiguities(BoundaryValueDiffEq; recursive = false)
+end
diff --git a/test/misc/type_stability.jl b/test/misc/type_stability.jl
deleted file mode 100644
index ef20e632..00000000
--- a/test/misc/type_stability.jl
+++ /dev/null
@@ -1,62 +0,0 @@
-using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
-
-f(u, p, t) = [p[1] * u[1] - p[2] * u[1] * u[2], p[3] * u[1] * u[2] - p[4] * u[2]]
-function f!(du, u, p, t)
-    du[1] = p[1] * u[1] - p[2] * u[1] * u[2]
-    du[2] = p[3] * u[1] * u[2] - p[4] * u[2]
-end
-
-bc(sol, p, t) = [sol[1][1] - 1, sol[end][2] - 2]
-function bc!(res, sol, p, t)
-    res[1] = sol[1][1] - 1
-    res[2] = sol[end][2] - 2
-end
-twobc_a(ua, p) = [ua[1] - 1]
-twobc_b(ub, p) = [ub[2] - 2]
-twobc_a!(resa, ua, p) = (resa[1] = ua[1] - 1)
-twobc_b!(resb, ub, p) = (resb[1] = ub[2] - 2)
-
-u0 = Float64[0, 0]
-tspan = (0.0, 1.0)
-p = [1.0, 1.0, 1.0, 1.0]
-bcresid_prototype = (zeros(1), zeros(1))
-
-# Multi-Point BVP
-@testset "Multi-Point BVP" begin
-    mpbvp_iip = BVProblem(f!, bc!, u0, tspan, p)
-    mpbvp_oop = BVProblem(f, bc, u0, tspan, p)
-
-    @testset "Shooting Methods" begin
-        @inferred solve(mpbvp_iip, Shooting(Tsit5()))
-        @inferred solve(mpbvp_oop, Shooting(Tsit5()))
-        @inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5()))
-        @inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5()))
-    end
-
-    @testset "MIRK Methods" begin
-        for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
-            @inferred solve(mpbvp_iip, solver; dt = 0.2)
-            @inferred solve(mpbvp_oop, solver; dt = 0.2)
-        end
-    end
-end
-
-# Two-Point BVP
-@testset "Two-Point BVP" begin
-    tpbvp_iip = TwoPointBVProblem(f!, (twobc_a!, twobc_b!), u0, tspan, p; bcresid_prototype)
-    tpbvp_oop = TwoPointBVProblem(f, (twobc_a, twobc_b), u0, tspan, p)
-
-    @testset "Shooting Methods" begin
-        @inferred solve(tpbvp_iip, Shooting(Tsit5()))
-        @inferred solve(tpbvp_oop, Shooting(Tsit5()))
-        @inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5()))
-        @inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5()))
-    end
-
-    @testset "MIRK Methods" begin
-        for solver in (MIRK2(), MIRK3(), MIRK4(), MIRK5(), MIRK6())
-            @inferred solve(tpbvp_iip, solver; dt = 0.2)
-            @inferred solve(tpbvp_oop, solver; dt = 0.2)
-        end
-    end
-end
diff --git a/test/misc/type_stability_tests.jl b/test/misc/type_stability_tests.jl
new file mode 100644
index 00000000..f05b1063
--- /dev/null
+++ b/test/misc/type_stability_tests.jl
@@ -0,0 +1,70 @@
+@testitem "Type Stability" begin
+    using LinearAlgebra
+
+    f(u, p, t) = [p[1] * u[1] - p[2] * u[1] * u[2], p[3] * u[1] * u[2] - p[4] * u[2]]
+    function f!(du, u, p, t)
+        du[1] = p[1] * u[1] - p[2] * u[1] * u[2]
+        du[2] = p[3] * u[1] * u[2] - p[4] * u[2]
+    end
+
+    bc(sol, p, t) = [sol[1][1] - 1, sol[end][2] - 2]
+    function bc!(res, sol, p, t)
+        res[1] = sol[1][1] - 1
+        res[2] = sol[end][2] - 2
+    end
+    twobc_a(ua, p) = [ua[1] - 1]
+    twobc_b(ub, p) = [ub[2] - 2]
+    twobc_a!(resa, ua, p) = (resa[1] = ua[1] - 1)
+    twobc_b!(resb, ub, p) = (resb[1] = ub[2] - 2)
+
+    u0 = Float64[0, 0]
+    tspan = (0.0, 1.0)
+    p = [1.0, 1.0, 1.0, 1.0]
+    bcresid_prototype = (zeros(1), zeros(1))
+
+    jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))
+
+    # Multi-Point BVP
+    @testset "Multi-Point BVP" begin
+        mpbvp_iip = BVProblem(f!, bc!, u0, tspan, p; nlls = Val(false))
+        mpbvp_oop = BVProblem(f, bc, u0, tspan, p; nlls = Val(false))
+
+        @testset "Shooting Methods" begin
+            @inferred solve(mpbvp_iip, Shooting(Tsit5(); jac_alg))
+            @inferred solve(mpbvp_oop, Shooting(Tsit5(); jac_alg))
+            @inferred solve(mpbvp_iip, MultipleShooting(5, Tsit5(); jac_alg))
+            @inferred solve(mpbvp_oop, MultipleShooting(5, Tsit5(); jac_alg))
+        end
+
+        @testset "MIRK Methods" begin
+            for solver in (MIRK2(; jac_alg), MIRK3(; jac_alg), MIRK4(; jac_alg),
+                MIRK5(; jac_alg), MIRK6(; jac_alg))
+                @inferred solve(mpbvp_iip, solver; dt = 0.2)
+                @inferred solve(mpbvp_oop, solver; dt = 0.2)
+            end
+        end
+    end
+
+    # Two-Point BVP
+    @testset "Two-Point BVP" begin
+        tpbvp_iip = TwoPointBVProblem(
+            f!, (twobc_a!, twobc_b!), u0, tspan, p; bcresid_prototype, nlls = Val(false))
+        tpbvp_oop = TwoPointBVProblem(
+            f, (twobc_a, twobc_b), u0, tspan, p; nlls = Val(false))
+
+        @testset "Shooting Methods" begin
+            @inferred solve(tpbvp_iip, Shooting(Tsit5(); jac_alg))
+            @inferred solve(tpbvp_oop, Shooting(Tsit5(); jac_alg))
+            @inferred solve(tpbvp_iip, MultipleShooting(5, Tsit5(); jac_alg))
+            @inferred solve(tpbvp_oop, MultipleShooting(5, Tsit5(); jac_alg))
+        end
+
+        @testset "MIRK Methods" begin
+            for solver in (MIRK2(; jac_alg), MIRK3(; jac_alg), MIRK4(; jac_alg),
+                MIRK5(; jac_alg), MIRK6(; jac_alg))
+                @inferred solve(tpbvp_iip, solver; dt = 0.2)
+                @inferred solve(tpbvp_oop, solver; dt = 0.2)
+            end
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index efd5464a..88d0e890 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,71 +1,10 @@
-using Test, SafeTestsets
+using ReTestItems
 
-const GROUP = uppercase(get(ENV, "GROUP", "ALL"))
+ReTestItems.runtests(joinpath(@__DIR__, "mirk/"))
+ReTestItems.runtests(joinpath(@__DIR__, "misc/"))
+ReTestItems.runtests(joinpath(@__DIR__, "shooting/"))
 
-@testset "Boundary Value Problem Tests" begin
-    if GROUP == "ALL" || GROUP == "SHOOTING"
-        @time @testset "Shooting Method Tests" begin
-            @time @safetestset "Shooting Tests" begin
-                include("shooting/shooting_tests.jl")
-            end
-            @time @safetestset "Ray Tracing BVP" begin
-                include("shooting/ray_tracing.jl")
-            end
-            if VERSION ≥ v"1.10-"
-                @time @safetestset "Orbital" begin
-                    include("shooting/orbital.jl")
-                end
-            end
-            if VERSION ≥ v"1.10-"
-                @time @safetestset "Shooting NLLS Tests" begin
-                    include("shooting/nonlinear_least_squares.jl")
-                end
-            end
-        end
-    end
-
-    if GROUP == "ALL" || GROUP == "MIRK"
-        @time @testset "Collocation Method (MIRK) Tests" begin
-            @time @safetestset "Ensemble" begin
-                include("mirk/ensemble.jl")
-            end
-            @time @safetestset "MIRK Convergence Tests" begin
-                include("mirk/mirk_convergence_tests.jl")
-            end
-            @time @safetestset "Vector of Vector" begin
-                include("mirk/vectorofvector_initials.jl")
-            end
-            @time @safetestset "Interpolation Tests" begin
-                include("mirk/interpolation_test.jl")
-            end
-            if VERSION ≥ v"1.10-"
-                @time @safetestset "MIRK NLLS Tests" begin
-                    include("mirk/nonlinear_least_squares.jl")
-                end
-            end
-        end
-    end
-
-    if GROUP == "ALL" || GROUP == "OTHERS"
-        @time @testset "Miscelleneous" begin
-            @time @safetestset "Non Vector Inputs" begin
-                include("misc/non_vector_inputs.jl")
-            end
-            @time @safetestset "Type Stability" begin
-                include("misc/type_stability.jl")
-            end
-            @time @safetestset "ODE Interface Tests" begin
-                include("misc/odeinterface_wrapper.jl")
-            end
-            @time @safetestset "Initial Guess Function" begin
-                include("misc/initial_guess.jl")
-            end
-            @time @safetestset "Affine Geodesic" begin
-                include("misc/affine_geodesic.jl")
-            end
-            @time @safetestset "Aqua: Quality Assurance" begin
-                include("misc/aqua.jl")
-            end
-        end
-    end
+if !Sys.iswindows()
+    # Wrappers like ODEInterface don't support parallel testing
+    ReTestItems.runtests(joinpath(@__DIR__, "wrappers/"); nworkers = 0)
 end
diff --git a/test/shooting/basic_problems_tests.jl b/test/shooting/basic_problems_tests.jl
new file mode 100644
index 00000000..60c1d265
--- /dev/null
+++ b/test/shooting/basic_problems_tests.jl
@@ -0,0 +1,363 @@
+@testitem "Basic Shooting" begin
+    using LinearAlgebra, LinearSolve, JET
+
+    SOLVERS = [Shooting(Tsit5(), NewtonRaphson(; autodiff = AutoFiniteDiff())),
+        Shooting(Tsit5(), NewtonRaphson(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        Shooting(Tsit5()),
+        MultipleShooting(10, Tsit5(), NewtonRaphson(; autodiff = AutoFiniteDiff())),
+        MultipleShooting(
+            10, Tsit5(), NewtonRaphson(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        MultipleShooting(10, Tsit5())]
+    JET_SKIP = [false, false, true, false, false, true]
+    JET_BROKEN = [false, false, false, false, false, false]
+
+    tspan = (0.0, 100.0)
+    u0 = [0.0, 1.0]
+    # Inplace
+    function f1!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -u[1]
+        return nothing
+    end
+
+    function bc1!(resid, sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        resid[1] = sol(t₀)[1]
+        resid[2] = sol(t₁)[1] - 1
+        return nothing
+    end
+
+    bvp1 = BVProblem(f1!, bc1!, u0, tspan)
+    @test SciMLBase.isinplace(bvp1)
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp1, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3), maxiters = 10000)
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp1, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp1, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+    end
+
+    # Out of Place
+    f1(u, p, t) = [u[2], -u[1]]
+
+    function bc1(sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        return [sol(t₀)[1], sol(t₁)[1] - 1]
+    end
+
+    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1!, bc1, u0, tspan)
+    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1, bc1!, u0, tspan)
+
+    bvp2 = BVProblem(f1, bc1, u0, tspan)
+    @test !SciMLBase.isinplace(bvp2)
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp2, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3), maxiters = 10000)
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp2, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp2, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+    end
+
+    # Inplace
+    bc2a!(resid, ua, p) = (resid[1] = ua[1])
+    bc2b!(resid, ub, p) = (resid[1] = ub[1] - 1)
+
+    bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
+        bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))
+    @test SciMLBase.isinplace(bvp3)
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp3, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3), maxiters = 10000)
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp3, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp3, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+    end
+
+    # Out of Place
+    bc2a(ua, p) = [ua[1]]
+    bc2b(ub, p) = [ub[1] - 1]
+
+    bvp4 = TwoPointBVProblem(f1, (bc2a, bc2b), u0, tspan)
+    @test !SciMLBase.isinplace(bvp4)
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp4, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3), maxiters = 10000)
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp4, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp4, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3)) broken=JET_BROKEN[i]
+    end
+end
+
+@testitem "Shooting with Complex Values" begin
+    using LinearAlgebra, LinearSolve
+
+    SOLVERS = [
+        Shooting(Vern7(), NewtonRaphson(; autodiff = AutoFiniteDiff())), Shooting(Vern7()),
+        MultipleShooting(10, Vern7(), NewtonRaphson(; autodiff = AutoFiniteDiff())),
+        MultipleShooting(10, Vern7())]
+
+    function f1!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -u[1]
+        return nothing
+    end
+
+    function bc1!(resid, sol, p, t)
+        t₀, t₁ = first(t), last(t)
+        resid[1] = sol(t₀)[1]
+        resid[2] = sol(t₁)[1] - 1
+        return nothing
+    end
+
+    tspan = (0.0, 100.0)
+    u0 = [0.0, 1.0] .+ 1im
+    bvp = BVProblem(f1!, bc1!, u0, tspan)
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-3), maxiters = 10000)
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+    end
+end
+
+@testitem "Flow in a Channel" begin
+    using LinearAlgebra, LinearSolve
+
+    function flow_in_a_channel!(du, u, p, t)
+        R, P = p
+        A, f′′, f′, f, h′, h, θ′, θ = u
+        du[1] = 0
+        du[2] = R * (f′^2 - f * f′′) - R * A
+        du[3] = f′′
+        du[4] = f′
+        du[5] = -R * f * h′ - 1
+        du[6] = h′
+        du[7] = -P * f * θ′
+        du[8] = θ′
+    end
+
+    function bc_flow!(resid, sol, p, tspan)
+        t₁, t₂ = extrema(tspan)
+        solₜ₁ = sol(t₁)
+        solₜ₂ = sol(t₂)
+        resid[1] = solₜ₁[4]
+        resid[2] = solₜ₁[3]
+        resid[3] = solₜ₂[4] - 1
+        resid[4] = solₜ₂[3]
+        resid[5] = solₜ₁[6]
+        resid[6] = solₜ₂[6]
+        resid[7] = solₜ₁[8]
+        resid[8] = solₜ₂[8] - 1
+    end
+
+    tspan = (0.0, 1.0)
+    p = [10.0, 7.0]
+    u0 = zeros(8)
+
+    flow_bvp = BVProblem{true}(flow_in_a_channel!, bc_flow!, u0, tspan, p)
+
+    for solver in [
+        Shooting(AutoTsit5(Rosenbrock23()),
+            NewtonRaphson(; autodiff = AutoForwardDiff(; chunksize = 8))),
+        MultipleShooting(10, AutoTsit5(Rosenbrock23()),
+            NewtonRaphson(; autodiff = AutoForwardDiff(; chunksize = 8)))]
+        sol = solve(flow_bvp, solver; abstol = 1e-8, reltol = 1e-8,
+            odesolve_kwargs = (; abstol = 1e-8, reltol = 1e-8))
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-8
+    end
+end
+
+@testitem "Ray Tracing" begin
+    using LinearAlgebra, LinearSolve
+
+    @inline v(x, y, z, p) = 1 / (4 + cos(p[1] * x) + sin(p[2] * y) - cos(p[3] * z))
+    @inline ux(x, y, z, p) = -p[1] * sin(p[1] * x)
+    @inline uy(x, y, z, p) = p[2] * cos(p[2] * y)
+    @inline uz(x, y, z, p) = p[3] * sin(p[3] * z)
+
+    function ray_tracing(u, p, t)
+        du = similar(u)
+        ray_tracing!(du, u, p, t)
+        return du
+    end
+
+    function ray_tracing!(du, u, p, t)
+        x, y, z, ξ, η, ζ, T, S = u
+
+        nu = v(x, y, z, p) # Velocity of a sound wave, function of space;
+        μx = ux(x, y, z, p) # ∂(slowness)/∂x, function of space
+        μy = uy(x, y, z, p) # ∂(slowness)/∂y, function of space
+        μz = uz(x, y, z, p) # ∂(slowness)/∂z, function of space
+
+        du[1] = S * nu * ξ
+        du[2] = S * nu * η
+        du[3] = S * nu * ζ
+
+        du[4] = S * μx
+        du[5] = S * μy
+        du[6] = S * μz
+
+        du[7] = S / nu
+        du[8] = 0
+
+        return nothing
+    end
+
+    function ray_tracing_bc(sol, p, t)
+        res = similar(first(sol))
+        ray_tracing_bc!(res, sol, p, t)
+        return res
+    end
+
+    function ray_tracing_bc!(res, sol, p, t)
+        ua = sol(0.0)
+        ub = sol(1.0)
+        nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
+
+        res[1] = ua[1] - p[4]
+        res[2] = ua[2] - p[5]
+        res[3] = ua[3] - p[6]
+        res[4] = ua[7]      # T(0) = 0
+        res[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
+        res[6] = ub[1] - p[7]
+        res[7] = ub[2] - p[8]
+        res[8] = ub[3] - p[9]
+        return nothing
+    end
+
+    function ray_tracing_bc_a(ua, p)
+        resa = similar(ua, 5)
+        ray_tracing_bc_a!(resa, ua, p)
+        return resa
+    end
+
+    function ray_tracing_bc_a!(resa, ua, p)
+        nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
+
+        resa[1] = ua[1] - p[4]
+        resa[2] = ua[2] - p[5]
+        resa[3] = ua[3] - p[5]
+        resa[4] = ua[7]
+        resa[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
+
+        return nothing
+    end
+
+    function ray_tracing_bc_b(ub, p)
+        resb = similar(ub, 3)
+        ray_tracing_bc_b!(resb, ub, p)
+        return resb
+    end
+
+    function ray_tracing_bc_b!(resb, ub, p)
+        resb[1] = ub[1] - p[7]
+        resb[2] = ub[2] - p[8]
+        resb[3] = ub[3] - p[9]
+        return nothing
+    end
+
+    p = [0, 1, 2, 0, 0, 0, 4, 3, 2.0]
+
+    dx = p[7] - p[4]
+    dy = p[8] - p[5]
+    dz = p[9] - p[6]
+
+    u0 = zeros(8)
+    u0[1:3] .= 0 # position
+    u0[4] = dx / v(p[4], p[5], p[6], p)
+    u0[5] = dy / v(p[4], p[5], p[6], p)
+    u0[6] = dz / v(p[4], p[5], p[6], p)
+    u0[8] = 1
+
+    tspan = (0.0, 1.0)
+
+    prob_oop = BVProblem(
+        BVPFunction{false}(ray_tracing, ray_tracing_bc), u0, tspan, p; nlls = Val(false))
+    prob_iip = BVProblem(
+        BVPFunction{true}(ray_tracing!, ray_tracing_bc!), u0, tspan, p; nlls = Val(true))
+    prob_tp_oop = BVProblem(
+        BVPFunction{false}(
+            ray_tracing, (ray_tracing_bc_a, ray_tracing_bc_b); twopoint = Val(true)),
+        u0,
+        tspan,
+        p;
+        nlls = Val(true))
+    prob_tp_iip = BVProblem(
+        BVPFunction{true}(ray_tracing!, (ray_tracing_bc_a!, ray_tracing_bc_b!);
+            bcresid_prototype = (zeros(5), zeros(3)), twopoint = Val(true)),
+        u0,
+        tspan,
+        p;
+        nlls = Val(true))
+
+    alg_sp = MultipleShooting(10,
+        AutoVern7(Rodas4P());
+        grid_coarsening = true,
+        nlsolve = TrustRegion(),
+        jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(; chunksize = 8),
+            nonbc_diffmode = AutoSparseForwardDiff(; chunksize = 8)))
+    alg_dense = MultipleShooting(10,
+        AutoVern7(Rodas4P());
+        grid_coarsening = true,
+        nlsolve = TrustRegion(),
+        jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(; chunksize = 8),
+            nonbc_diffmode = AutoForwardDiff(; chunksize = 8)))
+    alg_default = MultipleShooting(10, AutoVern7(Rodas4P()); grid_coarsening = true)
+
+    for (prob, alg) in Iterators.product(
+        (prob_oop, prob_iip, prob_tp_oop, prob_tp_iip), (alg_sp, alg_dense, alg_default))
+        sol = solve(prob, alg; abstol = 1e-6, reltol = 1e-6, maxiters = 1000,
+            odesolve_kwargs = (; abstol = 1e-8, reltol = 1e-5))
+
+        @test SciMLBase.successful_retcode(sol.retcode)
+        @test norm(sol.resid, Inf) < 1e-6
+    end
+end
diff --git a/test/shooting/nlls_tests.jl b/test/shooting/nlls_tests.jl
new file mode 100644
index 00000000..21aab1a9
--- /dev/null
+++ b/test/shooting/nlls_tests.jl
@@ -0,0 +1,154 @@
+@testitem "Overconstrained BVP" begin
+    using LinearAlgebra, JET
+
+    SOLVERS = [
+        Shooting(Tsit5(); jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))),
+        Shooting(
+            Tsit5(), LevenbergMarquardt(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        Shooting(Tsit5(), LevenbergMarquardt(; autodiff = AutoFiniteDiff())),
+        Shooting(Tsit5(), GaussNewton(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        Shooting(Tsit5(), GaussNewton(; autodiff = AutoFiniteDiff())),
+        Shooting(Tsit5(), TrustRegion(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        Shooting(Tsit5(), TrustRegion(; autodiff = AutoFiniteDiff())),
+        MultipleShooting(
+            10, Tsit5(); jac_alg = BVPJacobianAlgorithm(AutoForwardDiff(; chunksize = 2))),
+        MultipleShooting(
+            10, Tsit5(), LevenbergMarquardt(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        MultipleShooting(10, Tsit5(), LevenbergMarquardt(; autodiff = AutoFiniteDiff())),
+        MultipleShooting(
+            10, Tsit5(), GaussNewton(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        MultipleShooting(10, Tsit5(), GaussNewton(; autodiff = AutoFiniteDiff())),
+        MultipleShooting(
+            10, Tsit5(), TrustRegion(; autodiff = AutoForwardDiff(; chunksize = 2))),
+        MultipleShooting(10, Tsit5(), TrustRegion(; autodiff = AutoFiniteDiff()))]
+    JET_SKIP = fill(false, length(SOLVERS))
+    JET_OPT_BROKEN = fill(false, length(SOLVERS))
+    JET_CALL_BROKEN = fill(false, length(SOLVERS))
+
+    # OOP MP-BVP
+    f1(u, p, t) = [u[2], -u[1]]
+
+    function bc1(sol, p, t)
+        t₁, t₂ = extrema(t)
+        solₜ₁ = sol(t₁)
+        solₜ₂ = sol(t₂)
+        solₜ₃ = sol((t₁ + t₂) / 2)
+        # We know that this overconstrained system has a solution
+        return [solₜ₁[1], solₜ₂[1] - 1, solₜ₃[1] - 0.51735, solₜ₃[2] + 1.92533]
+    end
+
+    tspan = (0.0, 100.0)
+    u0 = [0.0, 1.0]
+
+    bvp1 = BVProblem(BVPFunction{false}(f1, bc1; bcresid_prototype = zeros(4)),
+        u0, tspan; nlls = Val(true))
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp1, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6))
+        @test norm(sol.resid, Inf) < 0.005
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp1, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_OPT_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp1, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_CALL_BROKEN[i]
+    end
+
+    # IIP MP-BVP
+    function f1!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = -u[1]
+        return nothing
+    end
+
+    function bc1!(resid, sol, p, t)
+        (t₁, t₂) = extrema(t)
+        solₜ₁ = sol(t₁)
+        solₜ₂ = sol(t₂)
+        solₜ₃ = sol((t₁ + t₂) / 2)
+        # We know that this overconstrained system has a solution
+        resid[1] = solₜ₁[1]
+        resid[2] = solₜ₂[1] - 1
+        resid[3] = solₜ₃[1] - 0.51735
+        resid[4] = solₜ₃[2] + 1.92533
+        return nothing
+    end
+
+    bvp2 = BVProblem(BVPFunction{true}(f1!, bc1!; bcresid_prototype = zeros(4)),
+        u0, tspan; nlls = Val(true))
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp2, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6))
+        @test norm(sol.resid, Inf) < 0.005
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp2, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_OPT_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp2, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_CALL_BROKEN[i]
+    end
+
+    # OOP TP-BVP
+    bc1a(ua, p) = [ua[1]]
+    bc1b(ub, p) = [ub[1] - 1, ub[2] + 1.729109]
+
+    bvp3 = BVProblem(
+        BVPFunction{false}(f1, (bc1a, bc1b); twopoint = Val(true),
+            bcresid_prototype = (zeros(1), zeros(2))),
+        u0,
+        tspan;
+        nlls = Val(true))
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp3, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6))
+        @test norm(sol.resid, Inf) < 0.009
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp3, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_OPT_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp3, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_CALL_BROKEN[i]
+    end
+
+    # IIP TP-BVP
+    bc1a!(resid, ua, p) = (resid[1] = ua[1])
+    bc1b!(resid, ub, p) = (resid[1] = ub[1] - 1; resid[2] = ub[2] + 1.729109)
+
+    bvp4 = BVProblem(
+        BVPFunction{true}(f1!, (bc1a!, bc1b!); twopoint = Val(true),
+            bcresid_prototype = (zeros(1), zeros(2))),
+        u0,
+        tspan;
+        nlls = Val(true))
+
+    for (i, solver) in enumerate(SOLVERS)
+        sol = solve(bvp4, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6))
+        @test norm(sol.resid, Inf) < 0.009
+
+        JET_SKIP[i] && continue
+        @test_opt target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp4, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_OPT_BROKEN[i]
+        @test_call target_modules=(
+            SciMLBase, DiffEqBase, NonlinearSolve, BoundaryValueDiffEq) solve(
+            bvp4, solver; verbose = false, abstol = 1e-6, reltol = 1e-6,
+            odesolve_kwargs = (; abstol = 1e-6, reltol = 1e-6)) broken=JET_CALL_BROKEN[i]
+    end
+end
diff --git a/test/shooting/nonlinear_least_squares.jl b/test/shooting/nonlinear_least_squares.jl
deleted file mode 100644
index d1142d46..00000000
--- a/test/shooting/nonlinear_least_squares.jl
+++ /dev/null
@@ -1,96 +0,0 @@
-using BoundaryValueDiffEq, LinearAlgebra, LinearSolve, OrdinaryDiffEq, Test
-
-@testset "Overconstrained BVP" begin
-    SOLVERS = [
-        Shooting(Tsit5()),
-        Shooting(Tsit5(); nlsolve = GaussNewton()),
-        Shooting(Tsit5(); nlsolve = TrustRegion()),
-        MultipleShooting(10, Tsit5()),
-        MultipleShooting(10, Tsit5(); nlsolve = GaussNewton()),
-        MultipleShooting(10, Tsit5(); nlsolve = TrustRegion())]
-
-    # OOP MP-BVP
-    f1(u, p, t) = [u[2], -u[1]]
-
-    function bc1(sol, p, t)
-        t₁, t₂ = extrema(t)
-        solₜ₁ = sol(t₁)
-        solₜ₂ = sol(t₂)
-        solₜ₃ = sol((t₁ + t₂) / 2)
-        # We know that this overconstrained system has a solution
-        return [solₜ₁[1], solₜ₂[1] - 1, solₜ₃[1] - 0.51735, solₜ₃[2] + 1.92533]
-    end
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0]
-
-    bvp1 = BVProblem(BVPFunction{false}(f1, bc1; bcresid_prototype = zeros(4)), u0, tspan)
-
-    for solver in SOLVERS
-        sol = @time solve(bvp1, solver; verbose = false)
-        @test norm(bc1(sol, nothing, sol.t), Inf) < 1e-4
-    end
-
-    # IIP MP-BVP
-    function f1!(du, u, p, t)
-        du[1] = u[2]
-        du[2] = -u[1]
-        return nothing
-    end
-
-    function bc1!(resid, sol, p, t)
-        (t₁, t₂) = extrema(t)
-        solₜ₁ = sol(t₁)
-        solₜ₂ = sol(t₂)
-        solₜ₃ = sol((t₁ + t₂) / 2)
-        # We know that this overconstrained system has a solution
-        resid[1] = solₜ₁[1]
-        resid[2] = solₜ₂[1] - 1
-        resid[3] = solₜ₃[1] - 0.51735
-        resid[4] = solₜ₃[2] + 1.92533
-        return nothing
-    end
-
-    bvp2 = BVProblem(BVPFunction{true}(f1!, bc1!; bcresid_prototype = zeros(4)), u0, tspan)
-
-    for solver in SOLVERS
-        sol = @time solve(bvp2, solver; verbose = false)
-        resid_f = Array{Float64}(undef, 4)
-        bc1!(resid_f, sol, nothing, sol.t)
-        @test norm(resid_f, Inf) < 1e-4
-    end
-
-    # OOP TP-BVP
-    bc1a(ua, p) = [ua[1]]
-    bc1b(ub, p) = [ub[1] - 1, ub[2] + 1.729109]
-
-    bvp3 = TwoPointBVProblem(
-        BVPFunction{false}(f1, (bc1a, bc1b); twopoint = Val(true),
-            bcresid_prototype = (zeros(1), zeros(2))),
-        u0,
-        tspan)
-
-    for solver in SOLVERS
-        sol = @time solve(bvp3, solver; verbose = false)
-        @test norm(vcat(bc1a(sol(0.0), nothing), bc1b(sol(100.0), nothing)), Inf) < 1e-4
-    end
-
-    # IIP TP-BVP
-    bc1a!(resid, ua, p) = (resid[1] = ua[1])
-    bc1b!(resid, ub, p) = (resid[1] = ub[1] - 1; resid[2] = ub[2] + 1.729109)
-
-    bvp4 = TwoPointBVProblem(
-        BVPFunction{true}(f1!, (bc1a!, bc1b!); twopoint = Val(true),
-            bcresid_prototype = (zeros(1), zeros(2))),
-        u0,
-        tspan)
-
-    for solver in SOLVERS
-        sol = @time solve(bvp4, solver; verbose = false)
-        resida = Array{Float64}(undef, 1)
-        residb = Array{Float64}(undef, 2)
-        bc1a!(resida, sol(0.0), nothing)
-        bc1b!(residb, sol(100.0), nothing)
-        @test norm(vcat(resida, residb), Inf) < 1e-4
-    end
-end
diff --git a/test/shooting/orbital.jl b/test/shooting/orbital.jl
deleted file mode 100644
index ff4337d9..00000000
--- a/test/shooting/orbital.jl
+++ /dev/null
@@ -1,112 +0,0 @@
-# Lambert's Problem
-using BoundaryValueDiffEq, OrdinaryDiffEq, LinearAlgebra, Test
-
-y0 = [
-    -4.7763169762853989E+06,
-    -3.8386398704441520E+05,
-    -5.3500183933132319E+06,
-    -5528.612564911408,
-    1216.8442360202787,
-    4845.114446429901
-]
-init_val = [
-    -4.7763169762853989E+06,
-    -3.8386398704441520E+05,
-    -5.3500183933132319E+06,
-    7.0526926403748598E+06,
-    -7.9650476230388973E+05,
-    -1.1911128863666430E+06
-]
-J2 = 1.08262668E-3
-req = 6378137
-myu = 398600.4418E+9
-t0 = 86400 * 2.3577475462484435E+04
-t1 = 86400 * 2.3577522023524125E+04
-tspan = (t0, t1)
-
-# ODE solver
-function orbital!(dy, y, p, t)
-    r2 = (y[1]^2 + y[2]^2 + y[3]^2)
-    r3 = r2^(3 / 2)
-    w = 1 + 1.5J2 * (req * req / r2) * (1 - 5y[3] * y[3] / r2)
-    w2 = 1 + 1.5J2 * (req * req / r2) * (3 - 5y[3] * y[3] / r2)
-    dy[1] = y[4]
-    dy[2] = y[5]
-    dy[3] = y[6]
-    dy[4] = -myu * y[1] * w / r3
-    dy[5] = -myu * y[2] * w / r3
-    dy[6] = -myu * y[3] * w2 / r3
-end
-
-function bc!_generator(resid, sol, init_val)
-    resid[1] = sol(t0)[1] - init_val[1]
-    resid[2] = sol(t0)[2] - init_val[2]
-    resid[3] = sol(t0)[3] - init_val[3]
-    resid[4] = sol(t1)[1] - init_val[4]
-    resid[5] = sol(t1)[2] - init_val[5]
-    resid[6] = sol(t1)[3] - init_val[6]
-end
-
-function bc!_generator_2p_a(resid0, ua, init_val)
-    resid0[1] = ua[1] - init_val[1]
-    resid0[2] = ua[2] - init_val[2]
-    resid0[3] = ua[3] - init_val[3]
-end
-function bc!_generator_2p_b(resid1, ub, init_val)
-    resid1[1] = ub[1] - init_val[4]
-    resid1[2] = ub[2] - init_val[5]
-    resid1[3] = ub[3] - init_val[6]
-end
-
-cur_bc! = (resid, sol, p, t) -> bc!_generator(resid, sol, init_val)
-cur_bc_2point_a! = (resid, sol, p) -> bc!_generator_2p_a(resid, sol, init_val)
-cur_bc_2point_b! = (resid, sol, p) -> bc!_generator_2p_b(resid, sol, init_val)
-resid_f = Array{Float64}(undef, 6)
-resid_f_2p = (Array{Float64, 1}(undef, 3), Array{Float64, 1}(undef, 3))
-
-### Now use the BVP solver to get closer
-bvp = BVProblem(orbital!, cur_bc!, y0, tspan)
-for autodiff in (
-    AutoForwardDiff(; chunksize = 6), AutoFiniteDiff(; fdtype = Val(:central)),
-    AutoSparseForwardDiff(; chunksize = 6), AutoFiniteDiff(; fdtype = Val(:forward)),
-    AutoSparseFiniteDiff())
-    nlsolve = TrustRegion(; autodiff)
-    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13,
-        reltol = 1e-13, verbose = false)
-    cur_bc!(resid_f, sol, nothing, sol.t)
-    @info "Single Shooting Lambert's Problem: $(norm(resid_f, Inf))"
-    @test norm(resid_f, Inf) < 0.005
-
-    # Older versions take too long on the first run
-    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff)
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg);
-        force_dtmin = true, abstol = 1e-6, reltol = 1e-6, verbose = false)
-    cur_bc!(resid_f, sol, nothing, sol.t)
-    @info "Multiple Shooting Lambert's Problem: $(norm(resid_f, Inf))"
-    @test norm(resid_f, Inf) < 0.005
-end
-
-### Using the TwoPoint BVP Structure
-bvp = TwoPointBVProblem(orbital!, (cur_bc_2point_a!, cur_bc_2point_b!), y0, tspan;
-    bcresid_prototype = (Array{Float64}(undef, 3), Array{Float64}(undef, 3)))
-for autodiff in (
-    AutoForwardDiff(; chunksize = 6), AutoFiniteDiff(; fdtype = Val(:central)),
-    AutoSparseForwardDiff(; chunksize = 6), AutoFiniteDiff(; fdtype = Val(:forward)),
-    AutoSparseFiniteDiff())
-    nlsolve = TrustRegion(; autodiff)
-    @time sol = solve(bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-13,
-        reltol = 1e-13, verbose = false)
-    cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
-    cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
-    @info "Single Shooting Lambert's Problem: $(norm(reduce(vcat, resid_f_2p), Inf))"
-    @test norm(reduce(vcat, resid_f_2p), Inf) < 0.005
-
-    # Older versions take too long on the first run
-    jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff, bc_diffmode = autodiff)
-    @time sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg);
-        force_dtmin = true, abstol = 1e-6, reltol = 1e-6, verbose = false)
-    cur_bc_2point_a!(resid_f_2p[1], sol(t0), nothing)
-    cur_bc_2point_b!(resid_f_2p[2], sol(t1), nothing)
-    @info "Multiple Shooting Lambert's Problem: $(norm(reduce(vcat, resid_f_2p), Inf))"
-    @test norm(reduce(vcat, resid_f_2p), Inf) < 0.005
-end
diff --git a/test/shooting/orbital_tests.jl b/test/shooting/orbital_tests.jl
new file mode 100644
index 00000000..26cf2f86
--- /dev/null
+++ b/test/shooting/orbital_tests.jl
@@ -0,0 +1,100 @@
+@testitem "Lambert's Problem" begin
+    using LinearAlgebra
+
+    y0 = [-4.7763169762853989E+06, -3.8386398704441520E+05, -5.3500183933132319E+06,
+        -5528.612564911408, 1216.8442360202787, 4845.114446429901]
+    init_val = [-4.7763169762853989E+06, -3.8386398704441520E+05, -5.3500183933132319E+06,
+        7.0526926403748598E+06, -7.9650476230388973E+05, -1.1911128863666430E+06]
+    J2 = 1.08262668E-3
+    req = 6378137
+    myu = 398600.4418E+9
+    t0 = 86400 * 2.3577475462484435E+04
+    t1 = 86400 * 2.3577522023524125E+04
+    tspan = (t0, t1)
+
+    # ODE solver
+    function orbital!(dy, y, p, t)
+        r2 = (y[1]^2 + y[2]^2 + y[3]^2)
+        r3 = r2^(3 / 2)
+        w = 1 + 1.5J2 * (req * req / r2) * (1 - 5y[3] * y[3] / r2)
+        w2 = 1 + 1.5J2 * (req * req / r2) * (3 - 5y[3] * y[3] / r2)
+        dy[1] = y[4]
+        dy[2] = y[5]
+        dy[3] = y[6]
+        dy[4] = -myu * y[1] * w / r3
+        dy[5] = -myu * y[2] * w / r3
+        dy[6] = -myu * y[3] * w2 / r3
+    end
+
+    function bc!_generator(resid, sol, init_val)
+        resid[1] = sol(t0)[1] - init_val[1]
+        resid[2] = sol(t0)[2] - init_val[2]
+        resid[3] = sol(t0)[3] - init_val[3]
+        resid[4] = sol(t1)[1] - init_val[4]
+        resid[5] = sol(t1)[2] - init_val[5]
+        resid[6] = sol(t1)[3] - init_val[6]
+    end
+
+    function bc!_generator_2p_a(resid0, ua, init_val)
+        resid0[1] = ua[1] - init_val[1]
+        resid0[2] = ua[2] - init_val[2]
+        resid0[3] = ua[3] - init_val[3]
+    end
+    function bc!_generator_2p_b(resid1, ub, init_val)
+        resid1[1] = ub[1] - init_val[4]
+        resid1[2] = ub[2] - init_val[5]
+        resid1[3] = ub[3] - init_val[6]
+    end
+
+    cur_bc! = (resid, sol, p, t) -> bc!_generator(resid, sol, init_val)
+    cur_bc_2point_a! = (resid, sol, p) -> bc!_generator_2p_a(resid, sol, init_val)
+    cur_bc_2point_b! = (resid, sol, p) -> bc!_generator_2p_b(resid, sol, init_val)
+
+    bvp = BVProblem(orbital!, cur_bc!, y0, tspan; nlls = Val(false))
+    for autodiff in (
+        AutoForwardDiff(; chunksize = 6), AutoFiniteDiff(; fdtype = Val(:central)),
+        AutoSparseForwardDiff(; chunksize = 6),
+        AutoFiniteDiff(; fdtype = Val(:forward)), AutoSparseFiniteDiff())
+        nlsolve = TrustRegion(; autodiff)
+
+        sol = solve(
+            bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-6, reltol = 1e-6,
+            verbose = false, odesolve_kwargs = (abstol = 1e-6, reltol = 1e-3))
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-6
+
+        jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff,
+            bc_diffmode = BoundaryValueDiffEq.__get_non_sparse_ad(autodiff))
+        sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg);
+            force_dtmin = true, abstol = 1e-6, reltol = 1e-6,
+            verbose = false, odesolve_kwargs = (abstol = 1e-6, reltol = 1e-3))
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-6
+    end
+
+    bvp = TwoPointBVProblem(orbital!, (cur_bc_2point_a!, cur_bc_2point_b!), y0, tspan;
+        bcresid_prototype = (Array{Float64}(undef, 3), Array{Float64}(undef, 3)),
+        nlls = Val(false))
+    for autodiff in (AutoForwardDiff(; chunksize = 6), AutoSparseFiniteDiff(),
+        AutoFiniteDiff(; fdtype = Val(:central)),
+        AutoFiniteDiff(; fdtype = Val(:forward)), AutoSparseForwardDiff(; chunksize = 6))
+        nlsolve = TrustRegion(; autodiff)
+
+        sol = solve(
+            bvp, Shooting(DP5(); nlsolve); force_dtmin = true, abstol = 1e-6, reltol = 1e-6,
+            verbose = false, odesolve_kwargs = (abstol = 1e-6, reltol = 1e-3))
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-6
+
+        jac_alg = BVPJacobianAlgorithm(; nonbc_diffmode = autodiff, bc_diffmode = autodiff)
+        sol = solve(bvp, MultipleShooting(10, DP5(); nlsolve, jac_alg);
+            force_dtmin = true, abstol = 1e-6, reltol = 1e-6,
+            verbose = false, odesolve_kwargs = (abstol = 1e-6, reltol = 1e-3))
+
+        @test SciMLBase.successful_retcode(sol)
+        @test norm(sol.resid, Inf) < 1e-6
+    end
+end
diff --git a/test/shooting/ray_tracing.jl b/test/shooting/ray_tracing.jl
deleted file mode 100644
index d913c6f6..00000000
--- a/test/shooting/ray_tracing.jl
+++ /dev/null
@@ -1,134 +0,0 @@
-using BoundaryValueDiffEq, LinearAlgebra, OrdinaryDiffEq, Test
-
-@inline v(x, y, z, p) = 1 / (4 + cos(p[1] * x) + sin(p[2] * y) - cos(p[3] * z))
-@inline ux(x, y, z, p) = -p[1] * sin(p[1] * x)
-@inline uy(x, y, z, p) = p[2] * cos(p[2] * y)
-@inline uz(x, y, z, p) = p[3] * sin(p[3] * z)
-
-function ray_tracing(u, p, t)
-    du = similar(u)
-    ray_tracing!(du, u, p, t)
-    return du
-end
-
-function ray_tracing!(du, u, p, t)
-    x, y, z, ξ, η, ζ, T, S = u
-
-    nu = v(x, y, z, p) # Velocity of a sound wave, function of space;
-    μx = ux(x, y, z, p) # ∂(slowness)/∂x, function of space
-    μy = uy(x, y, z, p) # ∂(slowness)/∂y, function of space
-    μz = uz(x, y, z, p) # ∂(slowness)/∂z, function of space
-
-    du[1] = S * nu * ξ
-    du[2] = S * nu * η
-    du[3] = S * nu * ζ
-
-    du[4] = S * μx
-    du[5] = S * μy
-    du[6] = S * μz
-
-    du[7] = S / nu
-    du[8] = 0
-
-    return nothing
-end
-
-function ray_tracing_bc(sol, p, t)
-    res = similar(first(sol))
-    ray_tracing_bc!(res, sol, p, t)
-    return res
-end
-
-function ray_tracing_bc!(res, sol, p, t)
-    ua = sol(0.0)
-    ub = sol(1.0)
-    nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
-
-    res[1] = ua[1] - p[4]
-    res[2] = ua[2] - p[5]
-    res[3] = ua[3] - p[6]
-    res[4] = ua[7]      # T(0) = 0
-    res[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
-    res[6] = ub[1] - p[7]
-    res[7] = ub[2] - p[8]
-    res[8] = ub[3] - p[9]
-    return nothing
-end
-
-function ray_tracing_bc_a(ua, p)
-    resa = similar(ua, 5)
-    ray_tracing_bc_a!(resa, ua, p)
-    return resa
-end
-
-function ray_tracing_bc_a!(resa, ua, p)
-    nu = v(ua[1], ua[2], ua[3], p) # Velocity of a sound wave, function of space;
-
-    resa[1] = ua[1] - p[4]
-    resa[2] = ua[2] - p[5]
-    resa[3] = ua[3] - p[5]
-    resa[4] = ua[7]
-    resa[5] = ua[4]^2 + ua[5]^2 + ua[6]^2 - 1 / nu^2
-
-    return nothing
-end
-
-function ray_tracing_bc_b(ub, p)
-    resb = similar(ub, 3)
-    ray_tracing_bc_b!(resb, ub, p)
-    return resb
-end
-
-function ray_tracing_bc_b!(resb, ub, p)
-    resb[1] = ub[1] - p[7]
-    resb[2] = ub[2] - p[8]
-    resb[3] = ub[3] - p[9]
-    return nothing
-end
-
-p = [0, 1, 2, 0, 0, 0, 4, 3, 2.0]
-
-dx = p[7] - p[4]
-dy = p[8] - p[5]
-dz = p[9] - p[6]
-
-u0 = zeros(8)
-u0[1:3] .= 0 # position
-u0[4] = dx / v(p[4], p[5], p[6], p)
-u0[5] = dy / v(p[4], p[5], p[6], p)
-u0[6] = dz / v(p[4], p[5], p[6], p)
-u0[8] = 1
-
-tspan = (0.0, 1.0)
-
-prob_oop = BVProblem{false}(ray_tracing, ray_tracing_bc, u0, tspan, p)
-prob_iip = BVProblem{true}(ray_tracing!, ray_tracing_bc!, u0, tspan, p)
-prob_tp_oop = TwoPointBVProblem{false}(ray_tracing, (ray_tracing_bc_a, ray_tracing_bc_b),
-    u0, tspan, p)
-prob_tp_iip = TwoPointBVProblem{true}(ray_tracing!, (ray_tracing_bc_a!, ray_tracing_bc_b!),
-    u0, tspan, p; bcresid_prototype = (zeros(5), zeros(3)))
-
-alg_sp = MultipleShooting(10, AutoVern7(Rodas4P()); grid_coarsening = true,
-    jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(),
-        nonbc_diffmode = AutoSparseForwardDiff()))
-alg_dense = MultipleShooting(10, AutoVern7(Rodas4P()); grid_coarsening = true,
-    jac_alg = BVPJacobianAlgorithm(; bc_diffmode = AutoForwardDiff(),
-        nonbc_diffmode = AutoForwardDiff()))
-alg_default = MultipleShooting(10, AutoVern7(Rodas4P()); grid_coarsening = true)
-
-for (prob, alg) in Iterators.product((prob_oop, prob_iip, prob_tp_oop, prob_tp_iip),
-    (alg_sp, alg_dense, alg_default))
-    @time sol = solve(prob, alg; abstol = 1e-9, reltol = 1e-9, maxiters = 1000)
-    @test SciMLBase.successful_retcode(sol.retcode)
-
-    if prob.problem_type isa TwoPointBVProblem
-        resida, residb = zeros(5), zeros(3)
-        ray_tracing_bc_a!(resida, sol.u[1], p)
-        ray_tracing_bc_b!(residb, sol.u[end], p)
-        @test norm(vcat(resida, residb), Inf) < 5e-5
-    else
-        resid = zeros(8)
-        ray_tracing_bc!(resid, sol, p, sol.t)
-        @test norm(resid, Inf) < 5e-5
-    end
-end
diff --git a/test/shooting/shooting_tests.jl b/test/shooting/shooting_tests.jl
deleted file mode 100644
index 10f58978..00000000
--- a/test/shooting/shooting_tests.jl
+++ /dev/null
@@ -1,175 +0,0 @@
-using BoundaryValueDiffEq, LinearAlgebra, LinearSolve, OrdinaryDiffEq, Test
-
-@testset "Basic Shooting Tests" begin
-    SOLVERS = [Shooting(Tsit5()), MultipleShooting(10, Tsit5())]
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0]
-
-    # Inplace
-    function f1!(du, u, p, t)
-        du[1] = u[2]
-        du[2] = -u[1]
-        return nothing
-    end
-
-    function bc1!(resid, sol, p, t)
-        t₀, t₁ = first(t), last(t)
-        resid[1] = sol(t₀)[1]
-        resid[2] = sol(t₁)[1] - 1
-        return nothing
-    end
-
-    bvp1 = BVProblem(f1!, bc1!, u0, tspan)
-    @test SciMLBase.isinplace(bvp1)
-    for solver in SOLVERS
-        resid_f = Array{Float64}(undef, 2)
-        sol = solve(bvp1, solver; abstol = 1e-13, reltol = 1e-13)
-        @test SciMLBase.successful_retcode(sol)
-        bc1!(resid_f, sol, nothing, sol.t)
-        @test norm(resid_f, Inf) < 1e-12
-    end
-
-    # Out of Place
-    f1(u, p, t) = [u[2], -u[1]]
-
-    function bc1(sol, p, t)
-        t₀, t₁ = first(t), last(t)
-        return [sol(t₀)[1], sol(t₁)[1] - 1]
-    end
-
-    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1!, bc1, u0, tspan)
-    @test_throws SciMLBase.NonconformingFunctionsError BVProblem(f1, bc1!, u0, tspan)
-
-    bvp2 = BVProblem(f1, bc1, u0, tspan)
-    @test !SciMLBase.isinplace(bvp2)
-    for solver in SOLVERS
-        sol = solve(bvp2, solver; abstol = 1e-13, reltol = 1e-13)
-        @test SciMLBase.successful_retcode(sol)
-        resid_f = bc1(sol, nothing, sol.t)
-        @test norm(resid_f, Inf) < 1e-12
-    end
-
-    # Inplace
-    bc2a!(resid, ua, p) = (resid[1] = ua[1])
-    bc2b!(resid, ub, p) = (resid[1] = ub[1] - 1)
-
-    bvp3 = TwoPointBVProblem(f1!, (bc2a!, bc2b!), u0, tspan;
-        bcresid_prototype = (Array{Float64}(undef, 1), Array{Float64}(undef, 1)))
-    @test SciMLBase.isinplace(bvp3)
-    for solver in SOLVERS
-        sol = solve(bvp3, solver; abstol = 1e-13, reltol = 1e-13)
-        @test SciMLBase.successful_retcode(sol)
-        resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
-        bc2a!(resid_f[1], sol(tspan[1]), nothing)
-        bc2b!(resid_f[2], sol(tspan[2]), nothing)
-        @test norm(reduce(vcat, resid_f), Inf) < 1e-12
-    end
-
-    # Out of Place
-    bc2a(ua, p) = [ua[1]]
-    bc2b(ub, p) = [ub[1] - 1]
-
-    bvp4 = TwoPointBVProblem(f1, (bc2a, bc2b), u0, tspan)
-    @test !SciMLBase.isinplace(bvp4)
-    for solver in SOLVERS
-        sol = solve(bvp4, solver; abstol = 1e-13, reltol = 1e-13)
-        @test SciMLBase.successful_retcode(sol)
-        resid_f = reduce(vcat, (bc2a(sol(tspan[1]), nothing), bc2b(sol(tspan[2]), nothing)))
-        @test norm(resid_f, Inf) < 1e-12
-    end
-end
-
-@testset "Shooting with Complex Values" begin
-    # Test for complex values
-    function f1!(du, u, p, t)
-        du[1] = u[2]
-        du[2] = -u[1]
-        return nothing
-    end
-
-    function bc1!(resid, sol, p, t)
-        t₀, t₁ = first(t), last(t)
-        resid[1] = sol(t₀)[1]
-        resid[2] = sol(t₁)[1] - 1
-        return nothing
-    end
-
-    tspan = (0.0, 100.0)
-    u0 = [0.0, 1.0] .+ 1im
-    bvp = BVProblem(f1!, bc1!, u0, tspan)
-    resid_f = Array{ComplexF64}(undef, 2)
-
-    # We will automatically use FiniteDiff if we can't use dual numbers
-    # Auto Selecting default solvers for Complex Valued Problems supported by a version of 
-    # NonlinearSolve that is not yet compatible with BoundaryValueDiffEq
-    for solver in [Shooting(Tsit5(), NewtonRaphson()),
-        MultipleShooting(10, Tsit5(), nlsolve = NewtonRaphson())]
-        sol = solve(bvp, solver; abstol = 1e-13, reltol = 1e-13)
-        @test SciMLBase.successful_retcode(sol)
-        bc1!(resid_f, sol, nothing, sol.t)
-        @test norm(resid_f, Inf) < 1e-12
-    end
-end
-
-@testset "Flow In a Channel" begin
-    function flow_in_a_channel!(du, u, p, t)
-        R, P = p
-        A, f′′, f′, f, h′, h, θ′, θ = u
-        du[1] = 0
-        du[2] = R * (f′^2 - f * f′′) - R * A
-        du[3] = f′′
-        du[4] = f′
-        du[5] = -R * f * h′ - 1
-        du[6] = h′
-        du[7] = -P * f * θ′
-        du[8] = θ′
-    end
-
-    function bc_flow!(resid, sol, p, tspan)
-        t₁, t₂ = extrema(tspan)
-        solₜ₁ = sol(t₁)
-        solₜ₂ = sol(t₂)
-        resid[1] = solₜ₁[4]
-        resid[2] = solₜ₁[3]
-        resid[3] = solₜ₂[4] - 1
-        resid[4] = solₜ₂[3]
-        resid[5] = solₜ₁[6]
-        resid[6] = solₜ₂[6]
-        resid[7] = solₜ₁[8]
-        resid[8] = solₜ₂[8] - 1
-    end
-
-    tspan = (0.0, 1.0)
-    p = [10.0, 7.0]
-    u0 = zeros(8)
-
-    flow_bvp = BVProblem{true}(flow_in_a_channel!, bc_flow!, u0, tspan, p)
-
-    sol_shooting = solve(flow_bvp,
-        Shooting(AutoTsit5(Rosenbrock23()), NewtonRaphson());
-        maxiters = 100)
-    @test SciMLBase.successful_retcode(sol_shooting)
-
-    resid = zeros(8)
-    bc_flow!(resid, sol_shooting, p, sol_shooting.t)
-    @test norm(resid, Inf) < 1e-6
-
-    sol_msshooting = solve(flow_bvp,
-        MultipleShooting(10, AutoTsit5(Rosenbrock23()); nlsolve = NewtonRaphson());
-        maxiters = 100)
-    @test SciMLBase.successful_retcode(sol_msshooting)
-
-    resid = zeros(8)
-    bc_flow!(resid, sol_msshooting, p, sol_msshooting.t)
-    @test norm(resid, Inf) < 1e-6
-end
-
-@testset "Testing Deprecations" begin
-    @test_deprecated Shooting(Tsit5();
-        nlsolve = NewtonRaphson(; autodiff = AutoForwardDiff(chunksize = 2)))
-
-    alg = Shooting(Tsit5();
-        nlsolve = NewtonRaphson(; autodiff = AutoForwardDiff(chunksize = 2)))
-    @test alg.jac_alg.diffmode == AutoForwardDiff(chunksize = 2)
-end
diff --git a/test/wrappers/odeinterface_tests.jl b/test/wrappers/odeinterface_tests.jl
new file mode 100644
index 00000000..e6c5b680
--- /dev/null
+++ b/test/wrappers/odeinterface_tests.jl
@@ -0,0 +1,110 @@
+@testsetup module ODEInterfaceWrapperTestSetup
+
+using BoundaryValueDiffEq, LinearAlgebra, ODEInterface, Random, RecursiveArrayTools
+
+# Adaptation of https://github.com/luchr/ODEInterface.jl/blob/958b6023d1dabf775033d0b89c5401b33100bca3/examples/BasicExamples/ex7.jl
+function ex7_f!(du, u, p, t)
+    ϵ = p[1]
+    u₁, λ = u
+    du[1] = (sin(t)^2 + λ * sin(t)^4 / u₁) / ϵ
+    du[2] = 0
+    return nothing
+end
+
+function ex7_2pbc1!(resa, ua, p)
+    resa[1] = ua[1] - 1
+    return nothing
+end
+
+function ex7_2pbc2!(resb, ub, p)
+    resb[1] = ub[1] - 1
+    return nothing
+end
+
+u0 = [0.5, 1.0]
+p = [0.1]
+tspan = (-π / 2, π / 2)
+
+export ex7_f!, ex7_2pbc1!, ex7_2pbc2!, u0, p, tspan
+
+end
+
+@testitem "BVPM2" setup=[ODEInterfaceWrapperTestSetup] begin
+    using ODEInterface, RecursiveArrayTools, LinearAlgebra
+
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), u0, tspan,
+        p; bcresid_prototype = (zeros(1), zeros(1)))
+
+    sol_bvpm2 = solve(tpprob, BVPM2(); dt = π / 20)
+    @test SciMLBase.successful_retcode(sol_bvpm2)
+    resid_f = (Array{Float64, 1}(undef, 1), Array{Float64, 1}(undef, 1))
+    ex7_2pbc1!(resid_f[1], sol_bvpm2(tspan[1]), nothing)
+    ex7_2pbc2!(resid_f[2], sol_bvpm2(tspan[2]), nothing)
+    @test norm(resid_f, Inf) < 1e-6
+end
+
+# Just test that it runs. BVPSOL only works with linearly separable BCs.
+@testitem "BVPSOL" setup=[ODEInterfaceWrapperTestSetup] begin
+    using ODEInterface, RecursiveArrayTools
+
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), u0, tspan,
+        p; bcresid_prototype = (zeros(1), zeros(1)))
+
+    # Just generate a solution for bvpsol
+    sol_ms = solve(tpprob, MultipleShooting(10, DP5(), NewtonRaphson());
+        dt = π / 20, abstol = 1e-5, maxiters = 1000, adaptive = false)
+
+    initial_u0 = [sol_ms(t) .+ rand() for t in tspan[1]:(π / 20):tspan[2]]
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), initial_u0,
+        tspan, p; bcresid_prototype = (zeros(1), zeros(1)))
+    # Just test that it runs. BVPSOL only works with linearly separable BCs.
+    sol_bvpsol = solve(tpprob, BVPSOL(); dt = π / 20)
+
+    @test sol_bvpsol isa SciMLBase.ODESolution
+
+    initial_u0 = VectorOfArray([sol_ms(t) .+ rand() for t in tspan[1]:(π / 20):tspan[2]])
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), initial_u0,
+        tspan, p; bcresid_prototype = (zeros(1), zeros(1)))
+    # Just test that it runs. BVPSOL only works with linearly separable BCs.
+    sol_bvpsol = solve(tpprob, BVPSOL(); dt = π / 20)
+
+    @test sol_bvpsol isa SciMLBase.ODESolution
+
+    ts = collect(tspan[1]:(π / 20):tspan[2])
+    initial_u0 = DiffEqArray([sol_ms(t) .+ rand() for t in ts], ts)
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), initial_u0,
+        tspan, p; bcresid_prototype = (zeros(1), zeros(1)))
+    sol_bvpsol = solve(tpprob, BVPSOL(); dt = π / 20)
+
+    @test sol_bvpsol isa SciMLBase.ODESolution
+
+    initial_u0 = (p, t) -> sol_ms(t) .+ rand()
+    tpprob = TwoPointBVProblem(ex7_f!, (ex7_2pbc1!, ex7_2pbc2!), initial_u0,
+        tspan, p; bcresid_prototype = (zeros(1), zeros(1)))
+    sol_bvpsol = solve(tpprob, BVPSOL(); dt = π / 20)
+
+    @test sol_bvpsol isa SciMLBase.ODESolution
+end
+
+@testitem "COLNEW" setup=[ODEInterfaceWrapperTestSetup] begin
+    using ODEInterface, RecursiveArrayTools
+
+    function f!(du, u, p, t)
+        du[1] = u[2]
+        du[2] = u[1]
+    end
+    function bca!(resid_a, u_a, p)
+        resid_a[1] = u_a[1] - 1
+    end
+    function bcb!(resid_b, u_b, p)
+        resid_b[1] = u_b[1]
+    end
+
+    fun = BVPFunction(
+        f!, (bca!, bcb!), bcresid_prototype = (zeros(1), zeros(1)), twopoint = Val(true))
+    tspan = (0.0, 1.0)
+
+    prob = TwoPointBVProblem(fun, [1.0, 0.0], tspan)
+    sol_colnew = solve(prob, COLNEW(), dt = 0.01)
+    @test SciMLBase.successful_retcode(sol_colnew)
+end