diff --git a/go.mod b/go.mod
index b18b7215..af25b888 100644
--- a/go.mod
+++ b/go.mod
@@ -43,6 +43,7 @@ require (
 	github.com/onsi/gomega v1.27.8
 	github.com/openshift-kni/eco-goinfra v0.0.0-20231006185241-4e996b952657
 	github.com/openshift-kni/k8sreporter v1.0.4
+	gonum.org/v1/gonum v0.14.0
 	gopkg.in/yaml.v2 v2.4.0
 	k8s.io/api v0.27.1
 	k8s.io/apimachinery v0.27.1
diff --git a/go.sum b/go.sum
index 34302f21..c894d441 100644
--- a/go.sum
+++ b/go.sum
@@ -1623,6 +1623,8 @@ golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8T
 gomodules.xyz/jsonpatch/v2 v2.1.0/go.mod h1:IhYNNY4jnS53ZnfE4PAmpKtDpTCj1JFXc+3mwe7XcUU=
 gomodules.xyz/jsonpatch/v2 v2.2.0 h1:4pT439QV83L+G9FkcCriY6EkpcK6r6bK+A5FBUMI7qY=
 gomodules.xyz/jsonpatch/v2 v2.2.0/go.mod h1:WXp+iVDkoLQqPudfQ9GBlwB2eZ5DKOnjQZCYdOS8GPY=
+gonum.org/v1/gonum v0.14.0 h1:2NiG67LD1tEH0D7kM+ps2V+fXmsAnpUeec7n8tcr4S0=
+gonum.org/v1/gonum v0.14.0/go.mod h1:AoWeoz0becf9QMWtE8iWXNXc27fK4fNeHNf/oMejGfU=
 google.golang.org/api v0.3.1/go.mod h1:6wY9I6uQWHQ8EM57III9mq/AjF+i8G65rmVagqKMtkk=
 google.golang.org/api v0.3.2/go.mod h1:6wY9I6uQWHQ8EM57III9mq/AjF+i8G65rmVagqKMtkk=
 google.golang.org/api v0.4.0/go.mod h1:8k5glujaEP+g9n7WNsDg8QP6cUVNI86fCNMcbazEtwE=
diff --git a/tests/ran-du/internal/randuconfig/config.go b/tests/ran-du/internal/randuconfig/config.go
index 1aac9e5e..ef79d5c2 100644
--- a/tests/ran-du/internal/randuconfig/config.go
+++ b/tests/ran-du/internal/randuconfig/config.go
@@ -24,9 +24,10 @@ type RanDuConfig struct {
 		CreateMethod   string `yaml:"create_method" envconfig:"ECO_RANDU_TESTWORKLOAD_CREATE_METHOD"`
 		CreateShellCmd string `yaml:"create_shell_cmd" envconfig:"ECO_RANDU_TESTWORKLOAD_CREATE_SHELLCMD"`
 	} `yaml:"randu_test_workload"`
-	SoftRebootIterations string `yaml:"soft_reboot_iterations" envconfig:"ECO_RANDU_SOFT_REBOOT_ITERATIONS"`
-	HardRebootIterations string `yaml:"hard_reboot_iterations" envconfig:"ECO_RANDU_HARD_REBOOT_ITERATIONS"`
-	IpmiToolImage        string `yaml:"ipmitool_image" envconfig:"ECO_RANDU_IPMITOOL_IMAGE"`
+	SoftRebootIterations     string `yaml:"soft_reboot_iterations" envconfig:"ECO_RANDU_SOFT_REBOOT_ITERATIONS"`
+	HardRebootIterations     string `yaml:"hard_reboot_iterations" envconfig:"ECO_RANDU_HARD_REBOOT_ITERATIONS"`
+	IpmiToolImage            string `yaml:"ipmitool_image" envconfig:"ECO_RANDU_IPMITOOL_IMAGE"`
+	LaunchWorkloadIterations string `yaml:"launch_workload_iterations" envconfig:"ECO_RANDU_LAUNCH_WORKLOAD_ITERATIONS"`
 }
 
 // NewRanDuConfig returns instance of RanDuConfig config type.
diff --git a/tests/ran-du/internal/randuconfig/default.yaml b/tests/ran-du/internal/randuconfig/default.yaml
index 496cfd26..078a6b2b 100644
--- a/tests/ran-du/internal/randuconfig/default.yaml
+++ b/tests/ran-du/internal/randuconfig/default.yaml
@@ -6,3 +6,4 @@ randu_test_workload:
   create_shell_cmd: '/opt/vdu-workload-emulator/add_test-deployments.sh'
 soft_reboot_iterations: '5'
 hard_reboot_iterations: '5'
+launch_workload_iterations: '5'
\ No newline at end of file
diff --git a/tests/ran-du/internal/randuparams/randuvars.go b/tests/ran-du/internal/randuparams/randuvars.go
index 6dd36fcd..eddd5ebc 100644
--- a/tests/ran-du/internal/randuparams/randuvars.go
+++ b/tests/ran-du/internal/randuparams/randuvars.go
@@ -22,4 +22,8 @@ var (
 
 	// TestNamespaceName is used for defining the namespace name where test resources are created.
 	TestNamespaceName = "ran-du-system-tests"
+
+	// TestMultipleLaunchWorkloadLoadAvg is used for defining the node load average threshold
+	// used in the LaunchWorkloadMultipleIterations test.
+	TestMultipleLaunchWorkloadLoadAvg = 100
 )
diff --git a/tests/ran-du/tests/launch-workload-multiple-iter-loadavg.go b/tests/ran-du/tests/launch-workload-multiple-iter-loadavg.go
new file mode 100644
index 00000000..fcdd8e1a
--- /dev/null
+++ b/tests/ran-du/tests/launch-workload-multiple-iter-loadavg.go
@@ -0,0 +1,98 @@
+package ran_du_system_test
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/openshift-kni/eco-goinfra/pkg/namespace"
+	"github.com/openshift-kni/eco-goinfra/pkg/nodes"
+	"github.com/openshift-kni/eco-goinfra/pkg/polarion"
+	"github.com/openshift-kni/eco-gosystem/tests/internal/await"
+	"github.com/openshift-kni/eco-gosystem/tests/internal/cmd"
+	"github.com/openshift-kni/eco-gosystem/tests/internal/shell"
+	. "github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randuinittools"
+	"github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randuparams"
+	"github.com/openshift-kni/eco-gosystem/tests/ran-du/internal/randutestworkload"
+	"gonum.org/v1/gonum/floats"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+var _ = Describe(
+	"LaunchWorkloadMultipleIterations",
+	Ordered,
+	ContinueOnFailure,
+	Label("LaunchWorkloadMultipleIterations"), func() {
+		It("Launch workload multiple times", polarion.ID("45698"), Label("LaunchWorkloadMultipleIterations"), func() {
+
+			By("Clean up workload namespace")
+			if namespace.NewBuilder(APIClient, RanDuTestConfig.TestWorkload.Namespace).Exists() {
+				err := randutestworkload.CleanNameSpace(randuparams.DefaultTimeout, RanDuTestConfig.TestWorkload.Namespace)
+				Expect(err).ToNot(HaveOccurred(), "Failed to clean workload test namespace objects")
+			}
+
+			launchIterations, err := strconv.Atoi(RanDuTestConfig.LaunchWorkloadIterations)
+			Expect(err).ToNot(HaveOccurred(), "Failed to parse launch workload iterations")
+
+			By("Launch workload")
+			for iter := 0; iter < launchIterations; iter++ {
+				fmt.Printf("Launch workload iteration no. %d\n", iter)
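+				// Each iteration re-launches the workload with the configured create method
+				// and waits for all of its resources to become ready.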
%d\n", iter) + if RanDuTestConfig.TestWorkload.CreateMethod == randuparams.TestWorkloadShellLaunchMethod { + By("Launching workload using shell method") + _, err := shell.ExecuteCmd(RanDuTestConfig.TestWorkload.CreateShellCmd) + Expect(err).ToNot(HaveOccurred(), "Failed to launch workload") + } + + By("Waiting for deployment replicas to become ready") + _, err := await.WaitUntilAllDeploymentsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace, + randuparams.DefaultTimeout) + Expect(err).ToNot(HaveOccurred(), "error while waiting for deployment to become ready") + + By("Waiting for statefulset replicas to become ready") + _, err = await.WaitUntilAllStatefulSetsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace, + randuparams.DefaultTimeout) + Expect(err).ToNot(HaveOccurred(), "error while waiting for statefulsets to become ready") + + By("Waiting for all pods to become ready") + _, err = await.WaitUntilAllPodsReady(APIClient, RanDuTestConfig.TestWorkload.Namespace, 10*time.Second) + Expect(err).ToNot(HaveOccurred(), "pod not ready: %s", err) + } + By("Retrieve nodes list") + nodeList, err := nodes.List( + APIClient, + metav1.ListOptions{}, + ) + Expect(err).ToNot(HaveOccurred(), "Error listing nodes.") + + By("Observe node load average while workload is running") + cmdToExec := []string{"awk", "{print $1}", "/proc/loadavg"} + var observedLoadAverage []float64 + + for n := 0; n < 30; n++ { + buf, err := cmd.ExecCmd(cmdToExec, nodeList[0].Definition.Name) + Expect(err).ToNot(HaveOccurred(), "could not execute command: %s", err) + + floatBuf, err := strconv.ParseFloat(strings.ReplaceAll(strings.ReplaceAll(buf, "\r", ""), "\n", ""), 32) + if err != nil { + fmt.Println(err) + } + + observedLoadAverage = append(observedLoadAverage, floatBuf) + time.Sleep(10 * time.Second) + } + + Expect(floats.Max(observedLoadAverage)).To(BeNumerically("<", randuparams.TestMultipleLaunchWorkloadLoadAvg), + "error: node load average detected above 100") + + }) + AfterAll(func() { + By("Cleaning up test workload resources") + err := randutestworkload.CleanNameSpace(randuparams.DefaultTimeout, RanDuTestConfig.TestWorkload.Namespace) + Expect(err).ToNot(HaveOccurred(), "Failed to clean workload test namespace objects") + }) + }) diff --git a/vendor/gonum.org/v1/gonum/AUTHORS b/vendor/gonum.org/v1/gonum/AUTHORS new file mode 100644 index 00000000..91467eaa --- /dev/null +++ b/vendor/gonum.org/v1/gonum/AUTHORS @@ -0,0 +1,128 @@ +# This is the official list of Gonum authors for copyright purposes. +# This file is distinct from the CONTRIBUTORS files. +# See the latter for an explanation. + +# Names should be added to this file as +# Name or Organization +# The email address is not required for organizations. + +# Please keep the list sorted. 
+ +Alexander Egurnov +Andrei Blinnikov +antichris +Bailey Lissington +Bill Gray +Bill Noon +Brendan Tracey +Brent Pedersen +Bulat Khasanov +Chad Kunde +Chan Kwan Yin +Chih-Wei Chang +Chong-Yeol Nah +Chris Tessum +Christophe Meessen +Christopher Waldon +Clayton Northey +Dan Kortschak +Daniel Fireman +Dario Heinisch +David Kleiven +David Samborski +Davor Kapsa +DeepMind Technologies +Delaney Gillilan +Dezmond Goff +Dong-hee Na +Dustin Spicuzza +Egon Elbre +Ekaterina Efimova +Ethan Burns +Evert Lammerts +Evgeny Savinov +Fabian Wickborn +Facundo Gaich +Fazlul Shahriar +Francesc Campoy +Google Inc +Gustaf Johansson +Hossein Zolfi +Iakov Davydov +Igor Mikushkin +Iskander Sharipov +Jalem Raj Rohit +James Bell +James Bowman +James Holmes <32bitkid@gmail.com> +Janne Snabb +Jeremy Atkinson +Jes Cok +Jinesi Yelizati +Jonas Kahler +Jonas Schulze +Jonathan J Lawlor +Jonathan Reiter +Jonathan Schroeder +Joost van Amersfoort +Joseph Watson +Josh Wilson +Julien Roland +Kai Trukenmüller +Kent English +Kevin C. Zimmerman +Kirill Motkov +Konstantin Shaposhnikov +Leonid Kneller +Lyron Winderbaum +Marco Leogrande +Mark Canning +Mark Skilbeck +Martin Diz +Matthew Connelly +Matthieu Di Mercurio +Max Halford +Maxim Sergeev +Microsoft Corporation +MinJae Kwon +Nathan Edwards +Nick Potts +Nils Wogatzky +Olivier Wulveryck +Or Rikon +Patricio Whittingslow +Patrick DeVivo +Pontus Melke +Renee French +Rishi Desai +Robin Eklind +Roger Welin +Rondall Jones +Sam Zaydel +Samuel Kelemen +Saran Ahluwalia +Scott Holden +Scott Kiesel +Sebastien Binet +Shawn Smith +Sintela Ltd +source{d} +Spencer Lyon +Steve McCoy +Taesu Pyo +Takeshi Yoneda +Tamir Hyman +The University of Adelaide +The University of Minnesota +The University of Washington +Thomas Berg +Tobin Harding +Valentin Deleplace +Vincent Thiery +Vladimír Chalupecký +Will Tekulve +Yasuhiro Matsumoto +Yevgeniy Vahlis +Yucheng Zhu +Yunomi +Zoe Juozapaitis diff --git a/vendor/gonum.org/v1/gonum/CONTRIBUTORS b/vendor/gonum.org/v1/gonum/CONTRIBUTORS new file mode 100644 index 00000000..93e01a2c --- /dev/null +++ b/vendor/gonum.org/v1/gonum/CONTRIBUTORS @@ -0,0 +1,131 @@ +# This is the official list of people who can contribute +# (and typically have contributed) code to the Gonum +# project. +# +# The AUTHORS file lists the copyright holders; this file +# lists people. For example, Google employees would be listed here +# but not in AUTHORS, because Google would hold the copyright. +# +# When adding J Random Contributor's name to this file, +# either J's name or J's organization's name should be +# added to the AUTHORS file. +# +# Names should be added to this file like so: +# Name +# +# Please keep the list sorted. 
+ +Alexander Egurnov +Andrei Blinnikov +Andrew Brampton +antichris +Bailey Lissington +Bill Gray +Bill Noon +Brendan Tracey +Brent Pedersen +Bulat Khasanov +Chad Kunde +Chan Kwan Yin +Chih-Wei Chang +Chong-Yeol Nah +Chris Tessum +Christophe Meessen +Christopher Waldon +Clayton Northey +Dan Kortschak +Dan Lorenc +Daniel Fireman +Dario Heinisch +David Kleiven +David Samborski +Davor Kapsa +Delaney Gillilan +Dezmond Goff +Dong-hee Na +Dustin Spicuzza +Egon Elbre +Ekaterina Efimova +Ethan Burns +Evert Lammerts +Evgeny Savinov +Fabian Wickborn +Facundo Gaich +Fazlul Shahriar +Francesc Campoy +Gustaf Johansson +Hossein Zolfi +Iakov Davydov +Igor Mikushkin +Iskander Sharipov +Jalem Raj Rohit +James Bell +James Bowman +James Holmes <32bitkid@gmail.com> +Janne Snabb +Jeremy Atkinson +Jes Cok +Jinesi Yelizati +Jon Richards +Jonas Kahler +Jonas Schulze +Jonathan J Lawlor +Jonathan Reiter +Jonathan Schroeder +Joost van Amersfoort +Joseph Watson +Josh Wilson +Julien Roland +Kai Trukenmüller +Kent English +Kevin C. Zimmerman +Kirill Motkov +Konstantin Shaposhnikov +Leonid Kneller +Lyron Winderbaum +Marco Leogrande +Mark Canning +Mark Skilbeck +Martin Diz +Matthew Connelly +Matthieu Di Mercurio +Max Halford +Maxim Sergeev +MinJae Kwon +Nathan Edwards +Nick Potts +Nils Wogatzky +Olivier Wulveryck +Or Rikon +Patricio Whittingslow +Patrick DeVivo +Pontus Melke +Renee French +Rishi Desai +Robin Eklind +Roger Welin +Roman Werpachowski +Rondall Jones +Sam Zaydel +Samuel Kelemen +Saran Ahluwalia +Scott Holden +Scott Kiesel +Sebastien Binet +Shawn Smith +Spencer Lyon +Steve McCoy +Taesu Pyo +Takeshi Yoneda +Tamir Hyman +Thomas Berg +Tobin Harding +Valentin Deleplace +Vincent Thiery +Vladimír Chalupecký +Will Tekulve +Yasuhiro Matsumoto +Yevgeniy Vahlis +Yucheng Zhu +Yunomi +Zoe Juozapaitis diff --git a/vendor/gonum.org/v1/gonum/LICENSE b/vendor/gonum.org/v1/gonum/LICENSE new file mode 100644 index 00000000..ed477e59 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/LICENSE @@ -0,0 +1,23 @@ +Copyright ©2013 The Gonum Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the Gonum project nor the names of its authors and + contributors may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
\ No newline at end of file diff --git a/vendor/gonum.org/v1/gonum/floats/README.md b/vendor/gonum.org/v1/gonum/floats/README.md new file mode 100644 index 00000000..e8ef46d5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/README.md @@ -0,0 +1,7 @@ +# Gonum floats + +[![go.dev reference](https://pkg.go.dev/badge/gonum.org/v1/gonum/floats)](https://pkg.go.dev/gonum.org/v1/gonum/floats) +[![GoDoc](https://godocs.io/gonum.org/v1/gonum/floats?status.svg)](https://godocs.io/gonum.org/v1/gonum/floats) + +Package floats provides a set of helper routines for dealing with slices of float64. +The functions avoid allocations to allow for use within tight loops without garbage collection overhead. diff --git a/vendor/gonum.org/v1/gonum/floats/doc.go b/vendor/gonum.org/v1/gonum/floats/doc.go new file mode 100644 index 00000000..bfe05c19 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/doc.go @@ -0,0 +1,11 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package floats provides a set of helper routines for dealing with slices +// of float64. The functions avoid allocations to allow for use within tight +// loops without garbage collection overhead. +// +// The convention used is that when a slice is being modified in place, it has +// the name dst. +package floats // import "gonum.org/v1/gonum/floats" diff --git a/vendor/gonum.org/v1/gonum/floats/floats.go b/vendor/gonum.org/v1/gonum/floats/floats.go new file mode 100644 index 00000000..5db73a05 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/floats.go @@ -0,0 +1,807 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package floats + +import ( + "errors" + "math" + "sort" + + "gonum.org/v1/gonum/floats/scalar" + "gonum.org/v1/gonum/internal/asm/f64" +) + +const ( + zeroLength = "floats: zero length slice" + shortSpan = "floats: slice length less than 2" + badLength = "floats: slice lengths do not match" + badDstLength = "floats: destination slice length does not match input" +) + +// Add adds, element-wise, the elements of s and dst, and stores the result in dst. +// It panics if the argument lengths do not match. +func Add(dst, s []float64) { + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, 1, s, dst) +} + +// AddTo adds, element-wise, the elements of s and t and +// stores the result in dst. +// It panics if the argument lengths do not match. +func AddTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, 1, s, t) + return dst +} + +// AddConst adds the scalar c to all of the values in dst. +func AddConst(c float64, dst []float64) { + f64.AddConst(c, dst) +} + +// AddScaled performs dst = dst + alpha * s. +// It panics if the slice argument lengths do not match. +func AddScaled(dst []float64, alpha float64, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.AxpyUnitaryTo(dst, alpha, s, dst) +} + +// AddScaledTo performs dst = y + alpha * s, where alpha is a scalar, +// and dst, y and s are all slices. +// It panics if the slice argument lengths do not match. 
+// +// At the return of the function, dst[i] = y[i] + alpha * s[i] +func AddScaledTo(dst, y []float64, alpha float64, s []float64) []float64 { + if len(s) != len(y) { + panic(badLength) + } + if len(dst) != len(y) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, alpha, s, y) + return dst +} + +// argsort is a helper that implements sort.Interface, as used by +// Argsort and ArgsortStable. +type argsort struct { + s []float64 + inds []int +} + +func (a argsort) Len() int { + return len(a.s) +} + +func (a argsort) Less(i, j int) bool { + return a.s[i] < a.s[j] +} + +func (a argsort) Swap(i, j int) { + a.s[i], a.s[j] = a.s[j], a.s[i] + a.inds[i], a.inds[j] = a.inds[j], a.inds[i] +} + +// Argsort sorts the elements of dst while tracking their original order. +// At the conclusion of Argsort, dst will contain the original elements of dst +// but sorted in increasing order, and inds will contain the original position +// of the elements in the slice such that dst[i] = origDst[inds[i]]. +// It panics if the argument lengths do not match. +func Argsort(dst []float64, inds []int) { + if len(dst) != len(inds) { + panic(badDstLength) + } + for i := range dst { + inds[i] = i + } + + a := argsort{s: dst, inds: inds} + sort.Sort(a) +} + +// ArgsortStable sorts the elements of dst while tracking their original order and +// keeping the original order of equal elements. At the conclusion of ArgsortStable, +// dst will contain the original elements of dst but sorted in increasing order, +// and inds will contain the original position of the elements in the slice such +// that dst[i] = origDst[inds[i]]. +// It panics if the argument lengths do not match. +func ArgsortStable(dst []float64, inds []int) { + if len(dst) != len(inds) { + panic(badDstLength) + } + for i := range dst { + inds[i] = i + } + + a := argsort{s: dst, inds: inds} + sort.Stable(a) +} + +// Count applies the function f to every element of s and returns the number +// of times the function returned true. +func Count(f func(float64) bool, s []float64) int { + var n int + for _, val := range s { + if f(val) { + n++ + } + } + return n +} + +// CumProd finds the cumulative product of the first i elements in +// s and puts them in place into the ith element of the +// destination dst. +// It panics if the argument lengths do not match. +// +// At the return of the function, dst[i] = s[i] * s[i-1] * s[i-2] * ... +func CumProd(dst, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) == 0 { + return dst + } + return f64.CumProd(dst, s) +} + +// CumSum finds the cumulative sum of the first i elements in +// s and puts them in place into the ith element of the +// destination dst. +// It panics if the argument lengths do not match. +// +// At the return of the function, dst[i] = s[i] + s[i-1] + s[i-2] + ... +func CumSum(dst, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) == 0 { + return dst + } + return f64.CumSum(dst, s) +} + +// Distance computes the L-norm of s - t. See Norm for special cases. +// It panics if the slice argument lengths do not match. 
+func Distance(s, t []float64, L float64) float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(s) == 0 { + return 0 + } + if L == 2 { + return f64.L2DistanceUnitary(s, t) + } + var norm float64 + if L == 1 { + for i, v := range s { + norm += math.Abs(t[i] - v) + } + return norm + } + if math.IsInf(L, 1) { + for i, v := range s { + absDiff := math.Abs(t[i] - v) + if absDiff > norm { + norm = absDiff + } + } + return norm + } + for i, v := range s { + norm += math.Pow(math.Abs(t[i]-v), L) + } + return math.Pow(norm, 1/L) +} + +// Div performs element-wise division dst / s +// and stores the value in dst. +// It panics if the argument lengths do not match. +func Div(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.Div(dst, s) +} + +// DivTo performs element-wise division s / t +// and stores the value in dst. +// It panics if the argument lengths do not match. +func DivTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + return f64.DivTo(dst, s, t) +} + +// Dot computes the dot product of s1 and s2, i.e. +// sum_{i = 1}^N s1[i]*s2[i]. +// It panics if the argument lengths do not match. +func Dot(s1, s2 []float64) float64 { + if len(s1) != len(s2) { + panic(badLength) + } + return f64.DotUnitary(s1, s2) +} + +// Equal returns true when the slices have equal lengths and +// all elements are numerically identical. +func Equal(s1, s2 []float64) bool { + if len(s1) != len(s2) { + return false + } + for i, val := range s1 { + if s2[i] != val { + return false + } + } + return true +} + +// EqualApprox returns true when the slices have equal lengths and +// all element pairs have an absolute tolerance less than tol or a +// relative tolerance less than tol. +func EqualApprox(s1, s2 []float64, tol float64) bool { + if len(s1) != len(s2) { + return false + } + for i, a := range s1 { + if !scalar.EqualWithinAbsOrRel(a, s2[i], tol, tol) { + return false + } + } + return true +} + +// EqualFunc returns true when the slices have the same lengths +// and the function returns true for all element pairs. +func EqualFunc(s1, s2 []float64, f func(float64, float64) bool) bool { + if len(s1) != len(s2) { + return false + } + for i, val := range s1 { + if !f(val, s2[i]) { + return false + } + } + return true +} + +// EqualLengths returns true when all of the slices have equal length, +// and false otherwise. It also returns true when there are no input slices. +func EqualLengths(slices ...[]float64) bool { + // This length check is needed: http://play.golang.org/p/sdty6YiLhM + if len(slices) == 0 { + return true + } + l := len(slices[0]) + for i := 1; i < len(slices); i++ { + if len(slices[i]) != l { + return false + } + } + return true +} + +// Find applies f to every element of s and returns the indices of the first +// k elements for which the f returns true, or all such elements +// if k < 0. +// Find will reslice inds to have 0 length, and will append +// found indices to inds. +// If k > 0 and there are fewer than k elements in s satisfying f, +// all of the found elements will be returned along with an error. +// At the return of the function, the input inds will be in an undetermined state. +func Find(inds []int, f func(float64) bool, s []float64, k int) ([]int, error) { + // inds is also returned to allow for calling with nil. + + // Reslice inds to have zero length. + inds = inds[:0] + + // If zero elements requested, can just return. 
+ if k == 0 { + return inds, nil + } + + // If k < 0, return all of the found indices. + if k < 0 { + for i, val := range s { + if f(val) { + inds = append(inds, i) + } + } + return inds, nil + } + + // Otherwise, find the first k elements. + nFound := 0 + for i, val := range s { + if f(val) { + inds = append(inds, i) + nFound++ + if nFound == k { + return inds, nil + } + } + } + // Finished iterating over the loop, which means k elements were not found. + return inds, errors.New("floats: insufficient elements found") +} + +// HasNaN returns true when the slice s has any values that are NaN and false +// otherwise. +func HasNaN(s []float64) bool { + for _, v := range s { + if math.IsNaN(v) { + return true + } + } + return false +} + +// LogSpan returns a set of n equally spaced points in log space between, +// l and u where N is equal to len(dst). The first element of the +// resulting dst will be l and the final element of dst will be u. +// It panics if the length of dst is less than 2. +// Note that this call will return NaNs if either l or u are negative, and +// will return all zeros if l or u is zero. +// Also returns the mutated slice dst, so that it can be used in range, like: +// +// for i, x := range LogSpan(dst, l, u) { ... } +func LogSpan(dst []float64, l, u float64) []float64 { + Span(dst, math.Log(l), math.Log(u)) + for i := range dst { + dst[i] = math.Exp(dst[i]) + } + return dst +} + +// LogSumExp returns the log of the sum of the exponentials of the values in s. +// Panics if s is an empty slice. +func LogSumExp(s []float64) float64 { + // Want to do this in a numerically stable way which avoids + // overflow and underflow + // First, find the maximum value in the slice. + maxval := Max(s) + if math.IsInf(maxval, 0) { + // If it's infinity either way, the logsumexp will be infinity as well + // returning now avoids NaNs + return maxval + } + var lse float64 + // Compute the sumexp part + for _, val := range s { + lse += math.Exp(val - maxval) + } + // Take the log and add back on the constant taken out + return math.Log(lse) + maxval +} + +// Max returns the maximum value in the input slice. If the slice is empty, Max will panic. +func Max(s []float64) float64 { + return s[MaxIdx(s)] +} + +// MaxIdx returns the index of the maximum value in the input slice. If several +// entries have the maximum value, the first such index is returned. +// It panics if s is zero length. +func MaxIdx(s []float64) int { + if len(s) == 0 { + panic(zeroLength) + } + max := math.NaN() + var ind int + for i, v := range s { + if math.IsNaN(v) { + continue + } + if v > max || math.IsNaN(max) { + max = v + ind = i + } + } + return ind +} + +// Min returns the minimum value in the input slice. +// It panics if s is zero length. +func Min(s []float64) float64 { + return s[MinIdx(s)] +} + +// MinIdx returns the index of the minimum value in the input slice. If several +// entries have the minimum value, the first such index is returned. +// It panics if s is zero length. +func MinIdx(s []float64) int { + if len(s) == 0 { + panic(zeroLength) + } + min := math.NaN() + var ind int + for i, v := range s { + if math.IsNaN(v) { + continue + } + if v < min || math.IsNaN(min) { + min = v + ind = i + } + } + return ind +} + +// Mul performs element-wise multiplication between dst +// and s and stores the value in dst. +// It panics if the argument lengths do not match. 
+func Mul(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + for i, val := range s { + dst[i] *= val + } +} + +// MulTo performs element-wise multiplication between s +// and t and stores the value in dst. +// It panics if the argument lengths do not match. +func MulTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + for i, val := range t { + dst[i] = val * s[i] + } + return dst +} + +// NearestIdx returns the index of the element in s +// whose value is nearest to v. If several such +// elements exist, the lowest index is returned. +// It panics if s is zero length. +func NearestIdx(s []float64, v float64) int { + if len(s) == 0 { + panic(zeroLength) + } + switch { + case math.IsNaN(v): + return 0 + case math.IsInf(v, 1): + return MaxIdx(s) + case math.IsInf(v, -1): + return MinIdx(s) + } + var ind int + dist := math.NaN() + for i, val := range s { + newDist := math.Abs(v - val) + // A NaN distance will not be closer. + if math.IsNaN(newDist) { + continue + } + if newDist < dist || math.IsNaN(dist) { + dist = newDist + ind = i + } + } + return ind +} + +// NearestIdxForSpan return the index of a hypothetical vector created +// by Span with length n and bounds l and u whose value is closest +// to v. That is, NearestIdxForSpan(n, l, u, v) is equivalent to +// Nearest(Span(make([]float64, n),l,u),v) without an allocation. +// It panics if n is less than two. +func NearestIdxForSpan(n int, l, u float64, v float64) int { + if n < 2 { + panic(shortSpan) + } + if math.IsNaN(v) { + return 0 + } + + // Special cases for Inf and NaN. + switch { + case math.IsNaN(l) && !math.IsNaN(u): + return n - 1 + case math.IsNaN(u): + return 0 + case math.IsInf(l, 0) && math.IsInf(u, 0): + if l == u { + return 0 + } + if n%2 == 1 { + if !math.IsInf(v, 0) { + return n / 2 + } + if math.Copysign(1, v) == math.Copysign(1, l) { + return 0 + } + return n/2 + 1 + } + if math.Copysign(1, v) == math.Copysign(1, l) { + return 0 + } + return n / 2 + case math.IsInf(l, 0): + if v == l { + return 0 + } + return n - 1 + case math.IsInf(u, 0): + if v == u { + return n - 1 + } + return 0 + case math.IsInf(v, -1): + if l <= u { + return 0 + } + return n - 1 + case math.IsInf(v, 1): + if u <= l { + return 0 + } + return n - 1 + } + + // Special cases for v outside (l, u) and (u, l). + switch { + case l < u: + if v <= l { + return 0 + } + if v >= u { + return n - 1 + } + case l > u: + if v >= l { + return 0 + } + if v <= u { + return n - 1 + } + default: + return 0 + } + + // Can't guarantee anything about exactly halfway between + // because of floating point weirdness. + return int((float64(n)-1)/(u-l)*(v-l) + 0.5) +} + +// Norm returns the L norm of the slice S, defined as +// (sum_{i=1}^N s[i]^L)^{1/L} +// Special cases: +// L = math.Inf(1) gives the maximum absolute value. +// Does not correctly compute the zero norm (use Count). +func Norm(s []float64, L float64) float64 { + // Should this complain if L is not positive? + // Should this be done in log space for better numerical stability? + // would be more cost + // maybe only if L is high? 
+ if len(s) == 0 { + return 0 + } + if L == 2 { + return f64.L2NormUnitary(s) + } + var norm float64 + if L == 1 { + for _, val := range s { + norm += math.Abs(val) + } + return norm + } + if math.IsInf(L, 1) { + for _, val := range s { + norm = math.Max(norm, math.Abs(val)) + } + return norm + } + for _, val := range s { + norm += math.Pow(math.Abs(val), L) + } + return math.Pow(norm, 1/L) +} + +// Prod returns the product of the elements of the slice. +// Returns 1 if len(s) = 0. +func Prod(s []float64) float64 { + prod := 1.0 + for _, val := range s { + prod *= val + } + return prod +} + +// Reverse reverses the order of elements in the slice. +func Reverse(s []float64) { + for i, j := 0, len(s)-1; i < j; i, j = i+1, j-1 { + s[i], s[j] = s[j], s[i] + } +} + +// Same returns true when the input slices have the same length and all +// elements have the same value with NaN treated as the same. +func Same(s, t []float64) bool { + if len(s) != len(t) { + return false + } + for i, v := range s { + w := t[i] + if v != w && !(math.IsNaN(v) && math.IsNaN(w)) { + return false + } + } + return true +} + +// Scale multiplies every element in dst by the scalar c. +func Scale(c float64, dst []float64) { + if len(dst) > 0 { + f64.ScalUnitary(c, dst) + } +} + +// ScaleTo multiplies the elements in s by c and stores the result in dst. +// It panics if the slice argument lengths do not match. +func ScaleTo(dst []float64, c float64, s []float64) []float64 { + if len(dst) != len(s) { + panic(badDstLength) + } + if len(dst) > 0 { + f64.ScalUnitaryTo(dst, c, s) + } + return dst +} + +// Span returns a set of N equally spaced points between l and u, where N +// is equal to the length of the destination. The first element of the destination +// is l, the final element of the destination is u. +// It panics if the length of dst is less than 2. +// +// Span also returns the mutated slice dst, so that it can be used in range expressions, +// like: +// +// for i, x := range Span(dst, l, u) { ... } +func Span(dst []float64, l, u float64) []float64 { + n := len(dst) + if n < 2 { + panic(shortSpan) + } + + // Special cases for Inf and NaN. + switch { + case math.IsNaN(l): + for i := range dst[:len(dst)-1] { + dst[i] = math.NaN() + } + dst[len(dst)-1] = u + return dst + case math.IsNaN(u): + for i := range dst[1:] { + dst[i+1] = math.NaN() + } + dst[0] = l + return dst + case math.IsInf(l, 0) && math.IsInf(u, 0): + for i := range dst[:len(dst)/2] { + dst[i] = l + dst[len(dst)-i-1] = u + } + if len(dst)%2 == 1 { + if l != u { + dst[len(dst)/2] = 0 + } else { + dst[len(dst)/2] = l + } + } + return dst + case math.IsInf(l, 0): + for i := range dst[:len(dst)-1] { + dst[i] = l + } + dst[len(dst)-1] = u + return dst + case math.IsInf(u, 0): + for i := range dst[1:] { + dst[i+1] = u + } + dst[0] = l + return dst + } + + step := (u - l) / float64(n-1) + for i := range dst { + dst[i] = l + step*float64(i) + } + return dst +} + +// Sub subtracts, element-wise, the elements of s from dst. +// It panics if the argument lengths do not match. +func Sub(dst, s []float64) { + if len(dst) != len(s) { + panic(badLength) + } + f64.AxpyUnitaryTo(dst, -1, s, dst) +} + +// SubTo subtracts, element-wise, the elements of t from s and +// stores the result in dst. +// It panics if the argument lengths do not match. 
+func SubTo(dst, s, t []float64) []float64 { + if len(s) != len(t) { + panic(badLength) + } + if len(dst) != len(s) { + panic(badDstLength) + } + f64.AxpyUnitaryTo(dst, -1, t, s) + return dst +} + +// Sum returns the sum of the elements of the slice. +func Sum(s []float64) float64 { + return f64.Sum(s) +} + +// Within returns the first index i where s[i] <= v < s[i+1]. Within panics if: +// - len(s) < 2 +// - s is not sorted +func Within(s []float64, v float64) int { + if len(s) < 2 { + panic(shortSpan) + } + if !sort.Float64sAreSorted(s) { + panic("floats: input slice not sorted") + } + if v < s[0] || v >= s[len(s)-1] || math.IsNaN(v) { + return -1 + } + for i, f := range s[1:] { + if v < f { + return i + } + } + return -1 +} + +// SumCompensated returns the sum of the elements of the slice calculated with greater +// accuracy than Sum at the expense of additional computation. +func SumCompensated(s []float64) float64 { + // SumCompensated uses an improved version of Kahan's compensated + // summation algorithm proposed by Neumaier. + // See https://en.wikipedia.org/wiki/Kahan_summation_algorithm for details. + var sum, c float64 + for _, x := range s { + // This type conversion is here to prevent a sufficiently smart compiler + // from optimising away these operations. + t := float64(sum + x) + if math.Abs(sum) >= math.Abs(x) { + c += (sum - t) + x + } else { + c += (x - t) + sum + } + sum = t + } + return sum + c +} diff --git a/vendor/gonum.org/v1/gonum/floats/scalar/doc.go b/vendor/gonum.org/v1/gonum/floats/scalar/doc.go new file mode 100644 index 00000000..9e69c193 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/scalar/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package scalar provides a set of helper routines for dealing with float64 values. +package scalar // import "gonum.org/v1/gonum/floats/scalar" diff --git a/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go b/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go new file mode 100644 index 00000000..46bf06b3 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/floats/scalar/scalar.go @@ -0,0 +1,171 @@ +// Copyright ©2013 The Gonum Authors. All rights reserved. +// Use of this code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package scalar + +import ( + "math" + "strconv" +) + +// EqualWithinAbs returns true when a and b have an absolute difference +// not greater than tol. +func EqualWithinAbs(a, b, tol float64) bool { + return a == b || math.Abs(a-b) <= tol +} + +// minNormalFloat64 is the smallest normal number. For 64 bit IEEE-754 +// floats this is 2^{-1022}. +const minNormalFloat64 = 0x1p-1022 + +// EqualWithinRel returns true when the difference between a and b +// is not greater than tol times the greater absolute value of a and b, +// +// abs(a-b) <= tol * max(abs(a), abs(b)). +func EqualWithinRel(a, b, tol float64) bool { + if a == b { + return true + } + delta := math.Abs(a - b) + if delta <= minNormalFloat64 { + return delta <= tol*minNormalFloat64 + } + // We depend on the division in this relationship to identify + // infinities (we rely on the NaN to fail the test) otherwise + // we compare Infs of the same sign and evaluate Infs as equal + // independent of sign. + return delta/math.Max(math.Abs(a), math.Abs(b)) <= tol +} + +// EqualWithinAbsOrRel returns true when a and b are equal to within +// the absolute or relative tolerances. 
See EqualWithinAbs and +// EqualWithinRel for details. +func EqualWithinAbsOrRel(a, b, absTol, relTol float64) bool { + return EqualWithinAbs(a, b, absTol) || EqualWithinRel(a, b, relTol) +} + +// EqualWithinULP returns true when a and b are equal to within +// the specified number of floating point units in the last place. +func EqualWithinULP(a, b float64, ulp uint) bool { + if a == b { + return true + } + if math.IsNaN(a) || math.IsNaN(b) { + return false + } + if math.Signbit(a) != math.Signbit(b) { + return math.Float64bits(math.Abs(a))+math.Float64bits(math.Abs(b)) <= uint64(ulp) + } + return ulpDiff(math.Float64bits(a), math.Float64bits(b)) <= uint64(ulp) +} + +func ulpDiff(a, b uint64) uint64 { + if a > b { + return a - b + } + return b - a +} + +const ( + nanBits = 0x7ff8000000000000 + nanMask = 0xfff8000000000000 +) + +// NaNWith returns an IEEE 754 "quiet not-a-number" value with the +// payload specified in the low 51 bits of payload. +// The NaN returned by math.NaN has a bit pattern equal to NaNWith(1). +func NaNWith(payload uint64) float64 { + return math.Float64frombits(nanBits | (payload &^ nanMask)) +} + +// NaNPayload returns the lowest 51 bits payload of an IEEE 754 "quiet +// not-a-number". For values of f other than quiet-NaN, NaNPayload +// returns zero and false. +func NaNPayload(f float64) (payload uint64, ok bool) { + b := math.Float64bits(f) + if b&nanBits != nanBits { + return 0, false + } + return b &^ nanMask, true +} + +// ParseWithNA converts the string s to a float64 in value. +// If s equals missing, weight is returned as 0, otherwise 1. +func ParseWithNA(s, missing string) (value, weight float64, err error) { + if s == missing { + return 0, 0, nil + } + value, err = strconv.ParseFloat(s, 64) + if err == nil { + weight = 1 + } + return value, weight, err +} + +// Round returns the half away from zero rounded value of x with prec precision. +// +// Special cases are: +// +// Round(±0) = +0 +// Round(±Inf) = ±Inf +// Round(NaN) = NaN +func Round(x float64, prec int) float64 { + if x == 0 { + // Make sure zero is returned + // without the negative bit set. + return 0 + } + // Fast path for positive precision on integers. + if prec >= 0 && x == math.Trunc(x) { + return x + } + pow := math.Pow10(prec) + intermed := x * pow + if math.IsInf(intermed, 0) { + return x + } + x = math.Round(intermed) + + if x == 0 { + return 0 + } + + return x / pow +} + +// RoundEven returns the half even rounded value of x with prec precision. +// +// Special cases are: +// +// RoundEven(±0) = +0 +// RoundEven(±Inf) = ±Inf +// RoundEven(NaN) = NaN +func RoundEven(x float64, prec int) float64 { + if x == 0 { + // Make sure zero is returned + // without the negative bit set. + return 0 + } + // Fast path for positive precision on integers. + if prec >= 0 && x == math.Trunc(x) { + return x + } + pow := math.Pow10(prec) + intermed := x * pow + if math.IsInf(intermed, 0) { + return x + } + x = math.RoundToEven(intermed) + + if x == 0 { + return 0 + } + + return x / pow +} + +// Same returns true when the inputs have the same value, allowing NaN equality. +func Same(a, b float64) bool { + return a == b || (math.IsNaN(a) && math.IsNaN(b)) +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s new file mode 100644 index 00000000..df63dc09 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssum_amd64.s @@ -0,0 +1,82 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1Norm(x []float64) float64 +TEXT ·L1Norm(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), SI // SI = &x + MOVQ x_len+8(FP), CX // CX = len(x) + XORQ AX, AX // i = 0 + PXOR X0, X0 // p_sum_i = 0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + PXOR X6, X6 + PXOR X7, X7 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE absum_end + MOVQ CX, BX + ANDQ $7, BX // BX = len(x) % 8 + SHRQ $3, CX // CX = floor( len(x) / 8 ) + JZ absum_tail_start // if CX == 0 { goto absum_tail_start } + +absum_loop: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1] + MOVUPS 16(SI)(AX*8), X9 + MOVUPS 32(SI)(AX*8), X10 + MOVUPS 48(SI)(AX*8), X11 + ADDPD X8, X0 // p_sum_i += X_i ( positive values ) + ADDPD X9, X2 + ADDPD X10, X4 + ADDPD X11, X6 + SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) + SUBPD X9, X3 + SUBPD X10, X5 + SUBPD X11, X7 + MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) + MAXPD X3, X2 + MAXPD X5, X4 + MAXPD X7, X6 + MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i + MOVAPS X2, X3 + MOVAPS X4, X5 + MOVAPS X6, X7 + ADDQ $8, AX // i += 8 + LOOP absum_loop // } while --CX > 0 + + // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) + ADDPD X3, X0 + ADDPD X5, X7 + ADDPD X7, X0 + + // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] + MOVAPS X0, X1 + SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) + ADDSD X1, X0 + CMPQ BX, $0 + JE absum_end // if BX == 0 { goto absum_end } + +absum_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + XORPS X8, X8 // X_8 = 0 + +absum_tail: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI)(AX*8), X8 // X_8 = x[i] + MOVSD X0, X1 // p_sum_1 = p_sum_0 + ADDSD X8, X0 // p_sum_0 += X_8 + SUBSD X8, X1 // p_sum_1 -= X_8 + MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) + INCQ AX // i++ + LOOP absum_tail // } while --CX > 0 + +absum_end: // return p_sum_0 + MOVSD X0, sum+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s new file mode 100644 index 00000000..64751733 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/abssuminc_amd64.s @@ -0,0 +1,90 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1NormInc(x []float64, n, incX int) (sum float64) +TEXT ·L1NormInc(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), SI // SI = &x + MOVQ n+24(FP), CX // CX = n + MOVQ incX+32(FP), AX // AX = increment * sizeof( float64 ) + SHLQ $3, AX + MOVQ AX, DX // DX = AX * 3 + IMULQ $3, DX + PXOR X0, X0 // p_sum_i = 0 + PXOR X1, X1 + PXOR X2, X2 + PXOR X3, X3 + PXOR X4, X4 + PXOR X5, X5 + PXOR X6, X6 + PXOR X7, X7 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE absum_end + MOVQ CX, BX + ANDQ $7, BX // BX = n % 8 + SHRQ $3, CX // CX = floor( n / 8 ) + JZ absum_tail_start // if CX == 0 { goto absum_tail_start } + +absum_loop: // do { + // p_sum = max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI), X8 // X_i[0] = x[i] + MOVSD (SI)(AX*1), X9 + MOVSD (SI)(AX*2), X10 + MOVSD (SI)(DX*1), X11 + LEAQ (SI)(AX*4), SI // SI = SI + 4 + MOVHPD (SI), X8 // X_i[1] = x[i+4] + MOVHPD (SI)(AX*1), X9 + MOVHPD (SI)(AX*2), X10 + MOVHPD (SI)(DX*1), X11 + ADDPD X8, X0 // p_sum_i += X_i ( positive values ) + ADDPD X9, X2 + ADDPD X10, X4 + ADDPD X11, X6 + SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) + SUBPD X9, X3 + SUBPD X10, X5 + SUBPD X11, X7 + MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) + MAXPD X3, X2 + MAXPD X5, X4 + MAXPD X7, X6 + MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i + MOVAPS X2, X3 + MOVAPS X4, X5 + MOVAPS X6, X7 + LEAQ (SI)(AX*4), SI // SI = SI + 4 + LOOP absum_loop // } while --CX > 0 + + // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) + ADDPD X3, X0 + ADDPD X5, X7 + ADDPD X7, X0 + + // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] + MOVAPS X0, X1 + SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) + ADDSD X1, X0 + CMPQ BX, $0 + JE absum_end // if BX == 0 { goto absum_end } + +absum_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + XORPS X8, X8 // X_8 = 0 + +absum_tail: // do { + // p_sum += max( p_sum + x[i], p_sum - x[i] ) + MOVSD (SI), X8 // X_8 = x[i] + MOVSD X0, X1 // p_sum_1 = p_sum_0 + ADDSD X8, X0 // p_sum_0 += X_8 + SUBSD X8, X1 // p_sum_1 -= X_8 + MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) + ADDQ AX, SI // i++ + LOOP absum_tail // } while --CX > 0 + +absum_end: // return p_sum_0 + MOVSD X0, sum+40(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s new file mode 100644 index 00000000..e377f512 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/add_amd64.s @@ -0,0 +1,66 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Add(dst, s []float64) +TEXT ·Add(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = max( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE add_end + XORQ AX, AX + MOVQ DI, BX + ANDQ $0x0F, BX // BX = &dst & 15 + JZ add_no_trim // if BX == 0 { goto add_no_trim } + + // Align on 16-bit boundary + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD (DI)(AX*8), X0 // X0 += dst[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // i++ + DECQ CX // --CX + JE add_end // if CX == 0 { return } + +add_no_trim: + MOVQ CX, BX + ANDQ $7, BX // BX = len(dst) % 8 + SHRQ $3, CX // CX = floor( len(dst) / 8 ) + JZ add_tail_start // if CX == 0 { goto add_tail_start } + +add_loop: // Loop unrolled 8x do { + MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] + MOVUPS 16(SI)(AX*8), X1 + MOVUPS 32(SI)(AX*8), X2 + MOVUPS 48(SI)(AX*8), X3 + ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1] + ADDPD 16(DI)(AX*8), X1 + ADDPD 32(DI)(AX*8), X2 + ADDPD 48(DI)(AX*8), X3 + MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i + MOVUPS X1, 16(DI)(AX*8) + MOVUPS X2, 32(DI)(AX*8) + MOVUPS X3, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP add_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE add_end + +add_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +add_tail: // do { + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD (DI)(AX*8), X0 // X0 += dst[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + LOOP add_tail // } while --CX > 0 + +add_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s new file mode 100644 index 00000000..6f52a8f6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/addconst_amd64.s @@ -0,0 +1,53 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Addconst(alpha float64, x []float64) +TEXT ·AddConst(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), SI // SI = &x + MOVQ x_len+16(FP), CX // CX = len(x) + CMPQ CX, $0 // if len(x) == 0 { return } + JE ac_end + MOVSD alpha+0(FP), X4 // X4 = { a, a } + SHUFPD $0, X4, X4 + MOVUPS X4, X5 // X5 = X4 + XORQ AX, AX // i = 0 + MOVQ CX, BX + ANDQ $7, BX // BX = len(x) % 8 + SHRQ $3, CX // CX = floor( len(x) / 8 ) + JZ ac_tail_start // if CX == 0 { goto ac_tail_start } + +ac_loop: // Loop unrolled 8x do { + MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] + MOVUPS 16(SI)(AX*8), X1 + MOVUPS 32(SI)(AX*8), X2 + MOVUPS 48(SI)(AX*8), X3 + ADDPD X4, X0 // X_i += a + ADDPD X5, X1 + ADDPD X4, X2 + ADDPD X5, X3 + MOVUPS X0, (SI)(AX*8) // s[i:i+1] = X_i + MOVUPS X1, 16(SI)(AX*8) + MOVUPS X2, 32(SI)(AX*8) + MOVUPS X3, 48(SI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP ac_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE ac_end + +ac_tail_start: // Reset loop counters + MOVQ BX, CX // Loop counter: CX = BX + +ac_tail: // do { + MOVSD (SI)(AX*8), X0 // X0 = s[i] + ADDSD X4, X0 // X0 += a + MOVSD X0, (SI)(AX*8) // s[i] = X0 + INCQ AX // ++i + LOOP ac_tail // } while --CX > 0 + +ac_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go new file mode 100644 index 00000000..2ab8129a --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpy.go @@ -0,0 +1,62 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float64, x, y []float64) { + for i, v := range x { + y[i] += alpha * v + } +} + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) { + for i, v := range x { + dst[i] = alpha*v + y[i] + } +} + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + y[iy] += alpha * x[ix] + ix += incX + iy += incY + } +} + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { + for i := 0; i < int(n); i++ { + dst[idst] = alpha*x[ix] + y[iy] + ix += incX + iy += incY + idst += incDst + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s new file mode 100644 index 00000000..a4e180fb --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyinc_amd64.s @@ -0,0 +1,142 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R11 +#define INC_Y R9 +#define INCx3_Y R12 +#define INC_DST R9 +#define INCx3_DST R12 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyInc(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y + MOVQ n+56(FP), LEN // LEN = n + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + + MOVQ ix+80(FP), INC_X + MOVQ iy+88(FP), INC_Y + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) + MOVQ Y_PTR, DST_PTR // DST_PTR = Y_PTR // Write pointer + + MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + + MOVSD alpha+0(FP), ALPHA // ALPHA = alpha + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVAPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + +loop: // do { // y[i] += alpha * x[i] unrolled 4x. 
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + ADDSD (Y_PTR)(INC_Y*2), X4 + ADDSD (Y_PTR)(INCx3_Y*1), X5 + + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset Loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + MOVSD X2, (DST_PTR) // y[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + // y[i] += alpha * x[i] for the last n % 4 iterations. + MOVSD (X_PTR), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[i] + MOVSD X2, (DST_PTR) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s new file mode 100644 index 00000000..0f54a394 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyincto_amd64.s @@ -0,0 +1,148 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DX +#define IDX AX +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R11 +#define INC_Y R9 +#define INCx3_Y R12 +#define INC_DST R10 +#define INCx3_DST R13 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) +TEXT ·AxpyIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst + MOVQ x_base+48(FP), X_PTR // X_PTR := &x + MOVQ y_base+72(FP), Y_PTR // Y_PTR := &y + MOVQ n+96(FP), LEN // LEN := n + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + + MOVQ ix+120(FP), INC_X + LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) + MOVQ iy+128(FP), INC_Y + LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) + MOVQ idst+32(FP), INC_DST + LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(dst[idst]) + + MOVQ incX+104(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ incY+112(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + MOVQ incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64) + SHLQ $3, INC_DST + MOVSD alpha+40(FP), ALPHA + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = n % 4 + SHRQ $2, LEN // LEN = floor( n / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVSD ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 + LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 + +loop: // do { // dst[i] = alpha * x[i] + y[i] unrolled 4x.
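+ // Unlike AxpyInc, the destination advances through its own stride + // (INC_DST), so dst may be a slice distinct from y; y is only read.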
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + ADDSD (Y_PTR)(INC_Y*2), X4 + ADDSD (Y_PTR)(INCx3_Y*1), X5 + + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) + LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset Loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + ADDSD (Y_PTR), X2 // X_i += y[i] + ADDSD (Y_PTR)(INC_Y*1), X3 + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) + LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[i] + MOVSD X2, (DST_PTR) // dst[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s new file mode 100644 index 00000000..f0b78596 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitary_amd64.s @@ -0,0 +1,134 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyUnitary(alpha float64, x, y []float64) +TEXT ·AxpyUnitary(SB), NOSPLIT, $0 + MOVQ x_base+8(FP), X_PTR // X_PTR := &x + MOVQ y_base+32(FP), Y_PTR // Y_PTR := &y + MOVQ x_len+16(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+40(FP), LEN + CMOVQLE y_len+40(FP), LEN + CMPQ LEN, $0 // if LEN == 0 { return } + JE end + XORQ IDX, IDX + MOVSD alpha+0(FP), ALPHA // ALPHA := { alpha, alpha } + SHUFPD $0, ALPHA, ALPHA + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining + MOVQ Y_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + MOVSD (X_PTR), X2 // X2 := x[0] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[0] + MOVSD X2, (DST_PTR) // y[0] = X2 + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL := n % 8 + SHRQ $3, LEN // LEN = floor( n / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + +loop: // do { + // y[i] += alpha * x[i] unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= a + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] + ADDPD 16(Y_PTR)(IDX*8), X3 + ADDPD 32(Y_PTR)(IDX*8), X4 + ADDPD 48(Y_PTR)(IDX*8), X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail_one } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] + MULPD ALPHA, X2 // X2 *= a + ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // } while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s new file mode 100644 index 00000000..dbb0a7ea --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/axpyunitaryto_amd64.s @@ -0,0 +1,140 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
+// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define Y_PTR DX +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) +TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst + MOVQ x_base+32(FP), X_PTR // X_PTR := &x + MOVQ y_base+56(FP), Y_PTR // Y_PTR := &y + MOVQ x_len+40(FP), LEN // LEN = min( len(x), len(y), len(dst) ) + CMPQ y_len+64(FP), LEN + CMOVQLE y_len+64(FP), LEN + CMPQ dst_len+8(FP), LEN + CMOVQLE dst_len+8(FP), LEN + + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + XORQ IDX, IDX // IDX = 0 + MOVSD alpha+24(FP), ALPHA + SHUFPD $0, ALPHA, ALPHA // ALPHA := { alpha, alpha } + MOVQ Y_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + MOVSD (X_PTR), X2 // X2 := x[0] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR), X2 // X2 += y[0] + MOVSD X2, (DST_PTR) // dst[0] = X2 + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL := n % 8 + SHRQ $3, LEN // LEN = floor( n / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining + +loop: // do { + // dst[i] = alpha * x[i] + y[i] unrolled 8x.
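+ // Eight elements per pass via four 128-bit MOVUPS/ADDPD pairs; the + // scalar head above aligned y so the packed ADDPD reads are safe.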
+ MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= alpha + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] + ADDPD 16(Y_PTR)(IDX*8), X3 + ADDPD 32(Y_PTR)(IDX*8), X4 + ADDPD 48(Y_PTR)(IDX*8), X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 // if TAIL == 0 { return } + JE end + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail_one } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] + MULPD ALPHA, X2 // X2 *= alpha + ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X2 + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // } while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { goto end } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] + MULSD ALPHA, X2 // X2 *= a + ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] + MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X2 + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s new file mode 100644 index 00000000..58168482 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumprod_amd64.s @@ -0,0 +1,71 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +TEXT ·CumProd(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + MOVQ CX, ret_len+56(FP) // len(ret) = CX + CMPQ CX, $0 // if CX == 0 { return } + JE cp_end + XORQ AX, AX // i = 0 + + MOVSD (SI), X5 // p_prod = { s[0], s[0] } + SHUFPD $0, X5, X5 + MOVSD X5, (DI) // dst[0] = s[0] + INCQ AX // ++i + DECQ CX // -- CX + JZ cp_end // if CX == 0 { return } + + MOVQ CX, BX + ANDQ $3, BX // BX = CX % 4 + SHRQ $2, CX // CX = floor( CX / 4 ) + JZ cp_tail_start // if CX == 0 { goto cp_tail_start } + +cp_loop: // Loop unrolled 4x do { + MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] + MOVUPS 16(SI)(AX*8), X2 + MOVAPS X0, X1 // X1 = X0 + MOVAPS X2, X3 + SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } + SHUFPD $1, X3, X3 + MULPD X0, X1 // X1 *= X0 + MULPD X2, X3 + SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } + SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } + SHUFPD $2, X3, X2 + SHUFPD $3, X3, X3 + MULPD X5, X0 // X0 *= p_prod + MULPD X1, X5 // p_prod *= X1 + MULPD X5, X2 + MOVUPS X0, (DI)(AX*8) // dst[i] = X0 + MOVUPS X2, 16(DI)(AX*8) + MULPD X3, X5 + ADDQ $4, AX // i += 4 + LOOP cp_loop // } while --CX > 0 + + // if BX == 0 { return } + CMPQ BX, $0 + JE cp_end + +cp_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +cp_tail: // do { + MULSD (SI)(AX*8), X5 // p_prod *= s[i] + MOVSD X5, (DI)(AX*8) // dst[i] = p_prod + INCQ AX // ++i + LOOP cp_tail // } while --CX > 0 + +cp_end: + MOVQ DI, ret_base+48(FP) // &ret = &dst + MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) + MOVQ SI, ret_cap+64(FP) + RET diff --git
a/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s new file mode 100644 index 00000000..85613202 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/cumsum_amd64.s @@ -0,0 +1,64 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +TEXT ·CumSum(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + MOVQ CX, ret_len+56(FP) // len(ret) = CX + CMPQ CX, $0 // if CX == 0 { return } + JE cs_end + XORQ AX, AX // i = 0 + PXOR X5, X5 // p_sum = 0 + MOVQ CX, BX + ANDQ $3, BX // BX = CX % 4 + SHRQ $2, CX // CX = floor( CX / 4 ) + JZ cs_tail_start // if CX == 0 { goto cs_tail_start } + +cs_loop: // Loop unrolled 4x do { + MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] + MOVUPS 16(SI)(AX*8), X2 + MOVAPS X0, X1 // X1 = X0 + MOVAPS X2, X3 + SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } + SHUFPD $1, X3, X3 + ADDPD X0, X1 // X1 += X0 + ADDPD X2, X3 + SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } + SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } + SHUFPD $2, X3, X2 + SHUFPD $3, X3, X3 + ADDPD X5, X0 // X0 += p_sum + ADDPD X1, X5 // p_sum += X1 + ADDPD X5, X2 + MOVUPS X0, (DI)(AX*8) // dst[i] = X0 + MOVUPS X2, 16(DI)(AX*8) + ADDPD X3, X5 + ADDQ $4, AX // i += 4 + LOOP cs_loop // } while --CX > 0 + + // if BX == 0 { return } + CMPQ BX, $0 + JE cs_end + +cs_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +cs_tail: // do { + ADDSD (SI)(AX*8), X5 // p_sum += s[i] + MOVSD X5, (DI)(AX*8) // dst[i] = p_sum + INCQ AX // ++i + LOOP cs_tail // } while --CX > 0 + +cs_end: + MOVQ DI, ret_base+48(FP) // &ret = &dst + MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) + MOVQ SI, ret_cap+64(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s new file mode 100644 index 00000000..95839767 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/div_amd64.s @@ -0,0 +1,67 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
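+// +// Div performs the in-place element-wise division dst[i] /= s[i], using +// packed DIVPD on eight elements per pass once s is 16-byte aligned.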
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func Div(dst, s []float64) +TEXT ·Div(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ s_base+24(FP), SI // SI = &s + CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) + CMOVQLE s_len+32(FP), CX + CMPQ CX, $0 // if CX == 0 { return } + JE div_end + XORQ AX, AX // i = 0 + MOVQ SI, BX + ANDQ $15, BX // BX = &s & 15 + JZ div_no_trim // if BX == 0 { goto div_no_trim } + + // Align on 16-byte boundary + MOVSD (DI)(AX*8), X0 // X0 = dst[i] + DIVSD (SI)(AX*8), X0 // X0 /= s[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + DECQ CX // --CX + JZ div_end // if CX == 0 { return } + +div_no_trim: + MOVQ CX, BX + ANDQ $7, BX // BX = len(dst) % 8 + SHRQ $3, CX // CX = floor( len(dst) / 8 ) + JZ div_tail_start // if CX == 0 { goto div_tail_start } + +div_loop: // Loop unrolled 8x do { + MOVUPS (DI)(AX*8), X0 // X0 = dst[i:i+1] + MOVUPS 16(DI)(AX*8), X1 + MOVUPS 32(DI)(AX*8), X2 + MOVUPS 48(DI)(AX*8), X3 + DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1] + DIVPD 16(SI)(AX*8), X1 + DIVPD 32(SI)(AX*8), X2 + DIVPD 48(SI)(AX*8), X3 + MOVUPS X0, (DI)(AX*8) // dst[i] = X0 + MOVUPS X1, 16(DI)(AX*8) + MOVUPS X2, 32(DI)(AX*8) + MOVUPS X3, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP div_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE div_end + +div_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +div_tail: // do { + MOVSD (DI)(AX*8), X0 // X0 = dst[i] + DIVSD (SI)(AX*8), X0 // X0 /= s[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + LOOP div_tail // } while --CX > 0 + +div_end: + RET + diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s new file mode 100644 index 00000000..e7094cb9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/divto_amd64.s @@ -0,0 +1,73 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
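+// +// DivTo stores x[i] / y[i] into dst[i]; a scalar head aligns y so the +// packed DIVPD reads below are 16-byte aligned.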
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func DivTo(dst, x, y []float64) +TEXT ·DivTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DI // DI = &dst + MOVQ dst_len+8(FP), CX // CX = len(dst) + MOVQ x_base+24(FP), SI // SI = &x + MOVQ y_base+48(FP), DX // DX = &y + CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) ) + CMOVQLE x_len+32(FP), CX + CMPQ y_len+56(FP), CX + CMOVQLE y_len+56(FP), CX + MOVQ CX, ret_len+80(FP) // len(ret) = CX + CMPQ CX, $0 // if CX == 0 { return } + JE div_end + XORQ AX, AX // i = 0 + MOVQ DX, BX + ANDQ $15, BX // BX = &y & 0xF + JZ div_no_trim // if BX == 0 { goto div_no_trim } + + // Align on 16-byte boundary + MOVSD (SI)(AX*8), X0 // X0 = x[i] + DIVSD (DX)(AX*8), X0 // X0 /= y[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + DECQ CX // --CX + JZ div_end // if CX == 0 { return } + +div_no_trim: + MOVQ CX, BX + ANDQ $7, BX // BX = len(dst) % 8 + SHRQ $3, CX // CX = floor( len(dst) / 8 ) + JZ div_tail_start // if CX == 0 { goto div_tail_start } + +div_loop: // Loop unrolled 8x do { + MOVUPS (SI)(AX*8), X0 // X0 = x[i:i+1] + MOVUPS 16(SI)(AX*8), X1 + MOVUPS 32(SI)(AX*8), X2 + MOVUPS 48(SI)(AX*8), X3 + DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1] + DIVPD 16(DX)(AX*8), X1 + DIVPD 32(DX)(AX*8), X2 + DIVPD 48(DX)(AX*8), X3 + MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0 + MOVUPS X1, 16(DI)(AX*8) + MOVUPS X2, 32(DI)(AX*8) + MOVUPS X3, 48(DI)(AX*8) + ADDQ $8, AX // i += 8 + LOOP div_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE div_end + +div_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + +div_tail: // do { + MOVSD (SI)(AX*8), X0 // X0 = x[i] + DIVSD (DX)(AX*8), X0 // X0 /= y[i] + MOVSD X0, (DI)(AX*8) // dst[i] = X0 + INCQ AX // ++i + LOOP div_tail // } while --CX > 0 + +div_end: + MOVQ DI, ret_base+72(FP) // &ret = &dst + MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst) + MOVQ DI, ret_cap+88(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go new file mode 100644 index 00000000..33c76c1e --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/doc.go @@ -0,0 +1,6 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package f64 provides float64 vector primitives. +package f64 // import "gonum.org/v1/gonum/internal/asm/f64" diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go new file mode 100644 index 00000000..09931644 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot.go @@ -0,0 +1,38 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file.
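+// +// The functions below are the pure Go fallbacks selected when the +// assembly implementations are excluded by the build constraints.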
+ +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum +func DotUnitary(x, y []float64) (sum float64) { + for i, v := range x { + sum += y[i] * v + } + return sum +} + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) { + for i := 0; i < int(n); i++ { + sum += y[iy] * x[ix] + ix += incX + iy += incY + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s new file mode 100644 index 00000000..c8cd4129 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/dot_amd64.s @@ -0,0 +1,145 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func DotUnitary(x, y []float64) (sum float64) +// This function assumes len(y) >= len(x). +TEXT ·DotUnitary(SB), NOSPLIT, $0 + MOVQ x+0(FP), R8 + MOVQ x_len+8(FP), DI // n = len(x) + MOVQ y+24(FP), R9 + + MOVSD $(0.0), X7 // sum = 0 + MOVSD $(0.0), X8 // sum = 0 + + MOVQ $0, SI // i = 0 + SUBQ $4, DI // n -= 4 + JL tail_uni // if n < 0 goto tail_uni + +loop_uni: + // sum += x[i] * y[i] unrolled 4x.
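+ // Two independent packed accumulators (X7, X8) hide ADDPD latency; + // they are combined into a single scalar sum after the loop.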
+ MOVUPD 0(R8)(SI*8), X0 + MOVUPD 0(R9)(SI*8), X1 + MOVUPD 16(R8)(SI*8), X2 + MOVUPD 16(R9)(SI*8), X3 + MULPD X1, X0 + MULPD X3, X2 + ADDPD X0, X7 + ADDPD X2, X8 + + ADDQ $4, SI // i += 4 + SUBQ $4, DI // n -= 4 + JGE loop_uni // if n >= 0 goto loop_uni + +tail_uni: + ADDQ $4, DI // n += 4 + JLE end_uni // if n <= 0 goto end_uni + +onemore_uni: + // sum += x[i] * y[i] for the remaining 1-3 elements. + MOVSD 0(R8)(SI*8), X0 + MOVSD 0(R9)(SI*8), X1 + MULSD X1, X0 + ADDSD X0, X7 + + ADDQ $1, SI // i++ + SUBQ $1, DI // n-- + JNZ onemore_uni // if n != 0 goto onemore_uni + +end_uni: + // Add the four sums together. + ADDPD X8, X7 + MOVSD X7, X0 + UNPCKHPD X7, X7 + ADDSD X0, X7 + MOVSD X7, sum+48(FP) // Return final sum. + RET + +// func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) +TEXT ·DotInc(SB), NOSPLIT, $0 + MOVQ x+0(FP), R8 + MOVQ y+24(FP), R9 + MOVQ n+48(FP), CX + MOVQ incX+56(FP), R11 + MOVQ incY+64(FP), R12 + MOVQ ix+72(FP), R13 + MOVQ iy+80(FP), R14 + + MOVSD $(0.0), X7 // sum = 0 + LEAQ (R8)(R13*8), SI // p = &x[ix] + LEAQ (R9)(R14*8), DI // q = &y[iy] + SHLQ $3, R11 // incX *= sizeof(float64) + SHLQ $3, R12 // incY *= sizeof(float64) + + SUBQ $2, CX // n -= 2 + JL tail_inc // if n < 0 goto tail_inc + +loop_inc: + // sum += *p * *q unrolled 2x. + MOVHPD (SI), X0 + MOVHPD (DI), X1 + ADDQ R11, SI // p += incX + ADDQ R12, DI // q += incY + MOVLPD (SI), X0 + MOVLPD (DI), X1 + ADDQ R11, SI // p += incX + ADDQ R12, DI // q += incY + + MULPD X1, X0 + ADDPD X0, X7 + + SUBQ $2, CX // n -= 2 + JGE loop_inc // if n >= 0 goto loop_inc + +tail_inc: + ADDQ $2, CX // n += 2 + JLE end_inc // if n <= 0 goto end_inc + + // sum += *p * *q for the last iteration if n is odd. + MOVSD (SI), X0 + MULSD (DI), X0 + ADDSD X0, X7 + +end_inc: + // Add the two sums together. + MOVSD X7, X0 + UNPCKHPD X7, X7 + ADDSD X0, X7 + MOVSD X7, sum+88(FP) // Return final sum. + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go new file mode 100644 index 00000000..5b042338 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_amd64.go @@ -0,0 +1,29 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f64 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func Ger(m, n uintptr, alpha float64, x []float64, incX uintptr, y []float64, incY uintptr, a []float64, lda uintptr) + +// GemvN computes +// +// y = alpha * A * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvN(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) + +// GemvT computes +// +// y = alpha * Aᵀ * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars.
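+// +// A is stored in row-major order with stride lda, so element (i, j) of A +// is a[i*lda+j].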
+func GemvT(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go new file mode 100644 index 00000000..e8dee051 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ge_noasm.go @@ -0,0 +1,125 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// Ger performs the rank-one operation +// +// A += alpha * x * yᵀ +// +// where A is an m×n dense matrix, x and y are vectors, and alpha is a scalar. +func Ger(m, n uintptr, alpha float64, x []float64, incX uintptr, y []float64, incY uintptr, a []float64, lda uintptr) { + if incX == 1 && incY == 1 { + x = x[:m] + y = y[:n] + for i, xv := range x { + AxpyUnitary(alpha*xv, y, a[uintptr(i)*lda:uintptr(i)*lda+n]) + } + return + } + + var ky, kx uintptr + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + + ix := kx + for i := 0; i < int(m); i++ { + AxpyInc(alpha*x[ix], y, a[uintptr(i)*lda:uintptr(i)*lda+n], n, incY, 1, ky, 0) + ix += incX + } +} + +// GemvN computes +// +// y = alpha * A * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. +func GemvN(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(n-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(m-1) * int(incY)) + } + + if incX == 1 && incY == 1 { + if beta == 0 { + for i = 0; i < m; i++ { + y[i] = alpha * DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + for i = 0; i < m; i++ { + y[i] = y[i]*beta + alpha*DotUnitary(a[lda*i:lda*i+n], x) + } + return + } + iy := ky + if beta == 0 { + for i = 0; i < m; i++ { + y[iy] = alpha * DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } + return + } + for i = 0; i < m; i++ { + y[iy] = y[iy]*beta + alpha*DotInc(x, a[lda*i:lda*i+n], n, incX, 1, kx, 0) + iy += incY + } +} + +// GemvT computes +// +// y = alpha * Aᵀ * x + beta * y +// +// where A is an m×n dense matrix, x and y are vectors, and alpha and beta are scalars. 
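+// +// beta == 0 is special-cased to clear y directly rather than scale it, so +// NaN or Inf values already present in y are not propagated.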
+func GemvT(m, n uintptr, alpha float64, a []float64, lda uintptr, x []float64, incX uintptr, beta float64, y []float64, incY uintptr) { + var kx, ky, i uintptr + if int(incX) < 0 { + kx = uintptr(-int(m-1) * int(incX)) + } + if int(incY) < 0 { + ky = uintptr(-int(n-1) * int(incY)) + } + switch { + case beta == 0: // beta == 0 is special-cased to memclear + if incY == 1 { + for i := range y { + y[i] = 0 + } + } else { + iy := ky + for i := 0; i < int(n); i++ { + y[iy] = 0 + iy += incY + } + } + case int(incY) < 0: + ScalInc(beta, y, n, uintptr(int(-incY))) + case incY == 1: + ScalUnitary(beta, y[:n]) + default: + ScalInc(beta, y, n, incY) + } + + if incX == 1 && incY == 1 { + for i = 0; i < m; i++ { + AxpyUnitaryTo(y, alpha*x[i], a[lda*i:lda*i+n], y) + } + return + } + ix := kx + for i = 0; i < m; i++ { + AxpyInc(alpha*x[ix], a[lda*i:lda*i+n], y, n, 1, incY, 0, ky) + ix += incX + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s new file mode 100644 index 00000000..917e0e30 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvN_amd64.s @@ -0,0 +1,685 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM m+0(FP) +#define M CX +#define N_DIM n+8(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define X x_base+56(FP) +#define INC_X R8 +#define INC3_X R9 + +#define Y_PTR DX +#define Y y_base+96(FP) +#define INC_Y R10 +#define INC3_Y R11 + +#define A_ROW AX +#define A_PTR DI +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X15 +#define BETA X14 + +#define INIT4 \ + XORPS X0, X0 \ + XORPS X1, X1 \ + XORPS X2, X2 \ + XORPS X3, X3 + +#define INIT2 \ + XORPS X0, X0 \ + XORPS X1, X1 + +#define INIT1 \ + XORPS X0, X0 + +#define KERNEL_LOAD4 \ + MOVUPS (X_PTR), X12 \ + MOVUPS 2*SIZE(X_PTR), X13 + +#define KERNEL_LOAD2 \ + MOVUPS (X_PTR), X12 + +#define KERNEL_LOAD4_INC \ + MOVSD (X_PTR), X12 \ + MOVHPD (X_PTR)(INC_X*1), X12 \ + MOVSD (X_PTR)(INC_X*2), X13 \ + MOVHPD (X_PTR)(INC3_X*1), X13 + +#define KERNEL_LOAD2_INC \ + MOVSD (X_PTR), X12 \ + MOVHPD (X_PTR)(INC_X*1), X12 + +#define KERNEL_4x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MOVUPS (A_PTR)(LDA*2), X8 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X9 \ + MOVUPS (A_PTR)(LDA3*1), X10 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X11 \ + MULPD X12, X4 \ + MULPD X13, X5 \ + MULPD X12, X6 \ + MULPD X13, X7 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + MULPD X12, X10 \ + MULPD X13, X11 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDPD X6, X1 \ + ADDPD X7, X1 \ + ADDPD X8, X2 \ + ADDPD X9, X2 \ + ADDPD X10, X3 \ + ADDPD X11, X3 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MOVUPS (A_PTR)(LDA*2), X6 \ + MOVUPS (A_PTR)(LDA3*1), X7 \ + MULPD X12, X4 \ + MULPD X12, X5 \ + MULPD X12, X6 \ + MULPD X12, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X2 \ + ADDPD X7, X3 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVDDUP (X_PTR), X12 \ + MOVSD (A_PTR), X4 \ + MOVHPD (A_PTR)(LDA*1), X4 \ + MOVSD (A_PTR)(LDA*2), X5 \ + MOVHPD (A_PTR)(LDA3*1), X5 \ + MULPD X12, X4 \ + MULPD X12, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X2 \ + ADDQ $SIZE, A_PTR + +#define STORE4 \ + MOVUPS (Y_PTR), X4 \ + MOVUPS 2*SIZE(Y_PTR), X5 \ + MULPD ALPHA, 
X0 \ + MULPD ALPHA, X2 \ + MULPD BETA, X4 \ + MULPD BETA, X5 \ + ADDPD X0, X4 \ + ADDPD X2, X5 \ + MOVUPS X4, (Y_PTR) \ + MOVUPS X5, 2*SIZE(Y_PTR) + +#define STORE4_INC \ + MOVSD (Y_PTR), X4 \ + MOVHPD (Y_PTR)(INC_Y*1), X4 \ + MOVSD (Y_PTR)(INC_Y*2), X5 \ + MOVHPD (Y_PTR)(INC3_Y*1), X5 \ + MULPD ALPHA, X0 \ + MULPD ALPHA, X2 \ + MULPD BETA, X4 \ + MULPD BETA, X5 \ + ADDPD X0, X4 \ + ADDPD X2, X5 \ + MOVLPD X4, (Y_PTR) \ + MOVHPD X4, (Y_PTR)(INC_Y*1) \ + MOVLPD X5, (Y_PTR)(INC_Y*2) \ + MOVHPD X5, (Y_PTR)(INC3_Y*1) + +#define KERNEL_2x4 \ + MOVUPS (A_PTR), X8 \ + MOVUPS 2*SIZE(A_PTR), X9 \ + MOVUPS (A_PTR)(LDA*1), X10 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X11 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + MULPD X12, X10 \ + MULPD X13, X11 \ + ADDPD X8, X0 \ + ADDPD X10, X1 \ + ADDPD X9, X0 \ + ADDPD X11, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS (A_PTR), X8 \ + MOVUPS (A_PTR)(LDA*1), X9 \ + MULPD X12, X8 \ + MULPD X12, X9 \ + ADDPD X8, X0 \ + ADDPD X9, X1 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVDDUP (X_PTR), X12 \ + MOVSD (A_PTR), X8 \ + MOVHPD (A_PTR)(LDA*1), X8 \ + MULPD X12, X8 \ + ADDPD X8, X0 \ + ADDQ $SIZE, A_PTR + +#define STORE2 \ + MOVUPS (Y_PTR), X4 \ + MULPD ALPHA, X0 \ + MULPD BETA, X4 \ + ADDPD X0, X4 \ + MOVUPS X4, (Y_PTR) + +#define STORE2_INC \ + MOVSD (Y_PTR), X4 \ + MOVHPD (Y_PTR)(INC_Y*1), X4 \ + MULPD ALPHA, X0 \ + MULPD BETA, X4 \ + ADDPD X0, X4 \ + MOVSD X4, (Y_PTR) \ + MOVHPD X4, (Y_PTR)(INC_Y*1) + +#define KERNEL_1x4 \ + MOVUPS (A_PTR), X8 \ + MOVUPS 2*SIZE(A_PTR), X9 \ + MULPD X12, X8 \ + MULPD X13, X9 \ + ADDPD X8, X0 \ + ADDPD X9, X0 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_1x2 \ + MOVUPS (A_PTR), X8 \ + MULPD X12, X8 \ + ADDPD X8, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (X_PTR), X12 \ + MOVSD (A_PTR), X8 \ + MULSD X12, X8 \ + ADDSD X8, X0 \ + ADDQ $SIZE, A_PTR + +#define STORE1 \ + HADDPD X0, X0 \ + MOVSD (Y_PTR), X4 \ + MULSD ALPHA, X0 \ + MULSD BETA, X4 \ + ADDSD X0, X4 \ + MOVSD X4, (Y_PTR) + +// func GemvN(m, n int, +// alpha float64, +// a []float64, lda int, +// x []float64, incX int, +// beta float64, +// y []float64, incY int) +TEXT ·GemvN(SB), NOSPLIT, $32-128 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + MOVDDUP beta+88(FP), BETA + + MOVQ x_base+56(FP), X_PTR + MOVQ y_base+96(FP), Y_PTR + MOVQ a_base+24(FP), A_ROW + MOVQ incY+120(FP), INC_Y + MOVQ lda+48(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + MOVQ A_ROW, A_PTR + + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + MOVQ Y_PTR, Y + + SHLQ $3, INC_Y // INC_Y = incY * sizeof(float64) + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + MOVSD $0.0, X0 + COMISD BETA, X0 + JNE gemv_start // if beta != 0 { goto gemv_start } + +gemv_clear: // beta == 0 is special cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path) + JNE inc_clear + + SHRQ $3, M + JZ clear4 + +clear8: + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + MOVUPS X2, 32(Y_PTR) + MOVUPS X3, 48(Y_PTR) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ clear8 + +clear4: + TESTQ $4, M_DIM + JZ clear2 + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + ADDQ $4*SIZE, Y_PTR + +clear2: + TESTQ $2, M_DIM + JZ clear1 + MOVUPS X0, (Y_PTR) + ADDQ $2*SIZE, Y_PTR + +clear1: + TESTQ $1, M_DIM + JZ prep_end + MOVSD
X0, (Y_PTR) + + JMP prep_end + +inc_clear: + SHRQ $2, M + JZ inc_clear2 + +inc_clear4: + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + MOVSD X2, (Y_PTR)(INC_Y*2) + MOVSD X3, (Y_PTR)(INC3_Y*1) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_clear4 + +inc_clear2: + TESTQ $2, M_DIM + JZ inc_clear1 + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_clear1: + TESTQ $1, M_DIM + JZ prep_end + MOVSD X0, (Y_PTR) + +prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +gemv_start: + CMPQ incX+80(FP), $1 // Check for dense vector X (fast-path) + JNE inc + + SHRQ $2, M + JZ r2 + +r4: + // LOAD 4 + INIT4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r4c2 + +r4c4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r4c4 + +r4c2: + TESTQ $2, N_DIM + JZ r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_4x2 + + ADDQ $2*SIZE, X_PTR + +r4c1: + HADDPD X1, X0 + HADDPD X3, X2 + TESTQ $1, N_DIM + JZ r4end + + // 4x1 KERNEL + KERNEL_4x1 + + ADDQ $SIZE, X_PTR + +r4end: + CMPQ INC_Y, $SIZE + JNZ r4st_inc + + STORE4 + ADDQ $4*SIZE, Y_PTR + JMP r4inc + +r4st_inc: + STORE4_INC + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +r4inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ r4 + +r2: + TESTQ $2, M_DIM + JZ r1 + + // LOAD 2 + INIT2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r2c2 + +r2c4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_2x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r2c4 + +r2c2: + TESTQ $2, N_DIM + JZ r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + + ADDQ $2*SIZE, X_PTR + +r2c1: + HADDPD X1, X0 + TESTQ $1, N_DIM + JZ r2end + + // 2x1 KERNEL + KERNEL_2x1 + + ADDQ $SIZE, X_PTR + +r2end: + CMPQ INC_Y, $SIZE + JNE r2st_inc + + STORE2 + ADDQ $2*SIZE, Y_PTR + JMP r2inc + +r2st_inc: + STORE2_INC + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +r2inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + INIT1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r1c2 + +r1c4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_1x4 + + ADDQ $4*SIZE, X_PTR + + DECQ N + JNZ r1c4 + +r1c2: + TESTQ $2, N_DIM + JZ r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_1x2 + + ADDQ $2*SIZE, X_PTR + +r1c1: + + TESTQ $1, N_DIM + JZ r1end + + // 1x1 KERNEL + KERNEL_1x1 + +r1end: + STORE1 + +end: + RET + +inc: // Algorithm for incX != 1 ( split loads in kernel ) + MOVQ incX+80(FP), INC_X // INC_X = incX + + XORQ TMP2, TMP2 // TMP2 = 0 + MOVQ N, TMP1 // TMP1 = N + SUBQ $1, TMP1 // TMP1 -= 1 + NEGQ TMP1 // TMP1 = -TMP1 + IMULQ INC_X, TMP1 // TMP1 *= INC_X + CMPQ INC_X, $0 // if INC_X < 0 { TMP2 = TMP1 } + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR // X_PTR = X_PTR[TMP2] + MOVQ X_PTR, X // X = X_PTR + + SHLQ $3, INC_X + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + + SHRQ $2, M + JZ inc_r2 + +inc_r4: + // LOAD 4 + INIT4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r4c2 + +inc_r4c4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + + DECQ N + JNZ inc_r4c4 + +inc_r4c2: + TESTQ $2, N_DIM + JZ inc_r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_4x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r4c1: + HADDPD X1, X0 + HADDPD X3, X2 + TESTQ $1, N_DIM + JZ inc_r4end + + // 4x1 KERNEL + KERNEL_4x1 + + ADDQ INC_X, X_PTR + +inc_r4end: + CMPQ INC_Y, $SIZE + JNE inc_r4st_inc + + STORE4 + ADDQ $4*SIZE, Y_PTR + JMP inc_r4inc + +inc_r4st_inc: + STORE4_INC + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + +inc_r4inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ inc_r4 + +inc_r2: + TESTQ $2, M_DIM + JZ inc_r1 
+ + // LOAD 2 + INIT2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r2c2 + +inc_r2c4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_2x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + DECQ N + JNZ inc_r2c4 + +inc_r2c2: + TESTQ $2, N_DIM + JZ inc_r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r2c1: + HADDPD X1, X0 + TESTQ $1, N_DIM + JZ inc_r2end + + // 2x1 KERNEL + KERNEL_2x1 + + ADDQ INC_X, X_PTR + +inc_r2end: + CMPQ INC_Y, $SIZE + JNE inc_r2st_inc + + STORE2 + ADDQ $2*SIZE, Y_PTR + JMP inc_r2inc + +inc_r2st_inc: + STORE2_INC + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r2inc: + MOVQ X, X_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_r1: + TESTQ $1, M_DIM + JZ inc_end + + // LOAD 1 + INIT1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r1c2 + +inc_r1c4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_1x4 + + LEAQ (X_PTR)(INC_X*4), X_PTR + DECQ N + JNZ inc_r1c4 + +inc_r1c2: + TESTQ $2, N_DIM + JZ inc_r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_1x2 + + LEAQ (X_PTR)(INC_X*2), X_PTR + +inc_r1c1: + TESTQ $1, N_DIM + JZ inc_r1end + + // 1x1 KERNEL + KERNEL_1x1 + +inc_r1end: + STORE1 + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s new file mode 100644 index 00000000..04071000 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/gemvT_amd64.s @@ -0,0 +1,745 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM n+8(FP) +#define M CX +#define N_DIM m+0(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define X x_base+56(FP) +#define Y_PTR DX +#define Y y_base+96(FP) +#define A_ROW AX +#define A_PTR DI + +#define INC_X R8 +#define INC3_X R9 + +#define INC_Y R10 +#define INC3_Y R11 + +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X15 +#define BETA X14 + +#define INIT4 \ + MOVDDUP (X_PTR), X8 \ + MOVDDUP (X_PTR)(INC_X*1), X9 \ + MOVDDUP (X_PTR)(INC_X*2), X10 \ + MOVDDUP (X_PTR)(INC3_X*1), X11 \ + MULPD ALPHA, X8 \ + MULPD ALPHA, X9 \ + MULPD ALPHA, X10 \ + MULPD ALPHA, X11 + +#define INIT2 \ + MOVDDUP (X_PTR), X8 \ + MOVDDUP (X_PTR)(INC_X*1), X9 \ + MULPD ALPHA, X8 \ + MULPD ALPHA, X9 + +#define INIT1 \ + MOVDDUP (X_PTR), X8 \ + MULPD ALPHA, X8 + +#define KERNEL_LOAD4 \ + MOVUPS (Y_PTR), X0 \ + MOVUPS 2*SIZE(Y_PTR), X1 + +#define KERNEL_LOAD2 \ + MOVUPS (Y_PTR), X0 + +#define KERNEL_LOAD4_INC \ + MOVSD (Y_PTR), X0 \ + MOVHPD (Y_PTR)(INC_Y*1), X0 \ + MOVSD (Y_PTR)(INC_Y*2), X1 \ + MOVHPD (Y_PTR)(INC3_Y*1), X1 + +#define KERNEL_LOAD2_INC \ + MOVSD (Y_PTR), X0 \ + MOVHPD (Y_PTR)(INC_Y*1), X0 + +#define KERNEL_4x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MULPD X8, X4 \ + MULPD X8, X5 \ + MULPD X9, X6 \ + MULPD X9, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + MOVUPS (A_PTR)(LDA*2), X4 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X5 \ + MOVUPS (A_PTR)(LDA3*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X7 \ + MULPD X10, X4 \ + MULPD X10, X5 \ + MULPD X11, X6 \ + MULPD X11, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MOVUPS (A_PTR)(LDA*1), X6 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X7 \ + MULPD X8, X4 \ + MULPD X8, X5 \ + 
MULPD X9, X6 \ + MULPD X9, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDPD X6, X0 \ + ADDPD X7, X1 \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVUPS (A_PTR), X4 \ + MOVUPS 2*SIZE(A_PTR), X5 \ + MULPD X8, X4 \ + MULPD X8, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X1 \ + ADDQ $4*SIZE, A_PTR + +#define STORE4 \ + MOVUPS X0, (Y_PTR) \ + MOVUPS X1, 2*SIZE(Y_PTR) + +#define STORE4_INC \ + MOVLPD X0, (Y_PTR) \ + MOVHPD X0, (Y_PTR)(INC_Y*1) \ + MOVLPD X1, (Y_PTR)(INC_Y*2) \ + MOVHPD X1, (Y_PTR)(INC3_Y*1) + +#define KERNEL_2x4 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MOVUPS (A_PTR)(LDA*2), X6 \ + MOVUPS (A_PTR)(LDA3*1), X7 \ + MULPD X8, X4 \ + MULPD X9, X5 \ + MULPD X10, X6 \ + MULPD X11, X7 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDPD X6, X0 \ + ADDPD X7, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS (A_PTR), X4 \ + MOVUPS (A_PTR)(LDA*1), X5 \ + MULPD X8, X4 \ + MULPD X9, X5 \ + ADDPD X4, X0 \ + ADDPD X5, X0 \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVUPS (A_PTR), X4 \ + MULPD X8, X4 \ + ADDPD X4, X0 \ + ADDQ $2*SIZE, A_PTR + +#define STORE2 \ + MOVUPS X0, (Y_PTR) + +#define STORE2_INC \ + MOVLPD X0, (Y_PTR) \ + MOVHPD X0, (Y_PTR)(INC_Y*1) + +#define KERNEL_1x4 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MOVSD (A_PTR)(LDA*1), X5 \ + MOVSD (A_PTR)(LDA*2), X6 \ + MOVSD (A_PTR)(LDA3*1), X7 \ + MULSD X8, X4 \ + MULSD X9, X5 \ + MULSD X10, X6 \ + MULSD X11, X7 \ + ADDSD X4, X0 \ + ADDSD X5, X0 \ + ADDSD X6, X0 \ + ADDSD X7, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x2 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MOVSD (A_PTR)(LDA*1), X5 \ + MULSD X8, X4 \ + MULSD X9, X5 \ + ADDSD X4, X0 \ + ADDSD X5, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (Y_PTR), X0 \ + MOVSD (A_PTR), X4 \ + MULSD X8, X4 \ + ADDSD X4, X0 \ + MOVSD X0, (Y_PTR) \ + ADDQ $SIZE, A_PTR + +#define SCALE_8(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MOVUPS 16(PTR), X1 \ + MOVUPS 32(PTR), X2 \ + MOVUPS 48(PTR), X3 \ + MULPD SCAL, X0 \ + MULPD SCAL, X1 \ + MULPD SCAL, X2 \ + MULPD SCAL, X3 \ + MOVUPS X0, (PTR) \ + MOVUPS X1, 16(PTR) \ + MOVUPS X2, 32(PTR) \ + MOVUPS X3, 48(PTR) + +#define SCALE_4(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MOVUPS 16(PTR), X1 \ + MULPD SCAL, X0 \ + MULPD SCAL, X1 \ + MOVUPS X0, (PTR) \ + MOVUPS X1, 16(PTR) \ + +#define SCALE_2(PTR, SCAL) \ + MOVUPS (PTR), X0 \ + MULPD SCAL, X0 \ + MOVUPS X0, (PTR) \ + +#define SCALE_1(PTR, SCAL) \ + MOVSD (PTR), X0 \ + MULSD SCAL, X0 \ + MOVSD X0, (PTR) \ + +#define SCALEINC_4(PTR, INC, INC3, SCAL) \ + MOVSD (PTR), X0 \ + MOVSD (PTR)(INC*1), X1 \ + MOVSD (PTR)(INC*2), X2 \ + MOVSD (PTR)(INC3*1), X3 \ + MULSD SCAL, X0 \ + MULSD SCAL, X1 \ + MULSD SCAL, X2 \ + MULSD SCAL, X3 \ + MOVSD X0, (PTR) \ + MOVSD X1, (PTR)(INC*1) \ + MOVSD X2, (PTR)(INC*2) \ + MOVSD X3, (PTR)(INC3*1) + +#define SCALEINC_2(PTR, INC, SCAL) \ + MOVSD (PTR), X0 \ + MOVSD (PTR)(INC*1), X1 \ + MULSD SCAL, X0 \ + MULSD SCAL, X1 \ + MOVSD X0, (PTR) \ + MOVSD X1, (PTR)(INC*1) + +// func GemvT(m, n int, +// alpha float64, +// a []float64, lda int, +// x []float64, incX int, +// beta float64, +// y []float64, incY int) +TEXT ·GemvT(SB), NOSPLIT, $32-128 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + + MOVQ x_base+56(FP), X_PTR + MOVQ y_base+96(FP), Y_PTR + MOVQ a_base+24(FP), A_ROW + MOVQ incY+120(FP), INC_Y // INC_Y = incY * sizeof(float64) + MOVQ lda+48(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + MOVQ 
A_ROW, A_PTR + + MOVQ incX+80(FP), INC_X // INC_X = incX * sizeof(float64) + + XORQ TMP2, TMP2 + MOVQ N, TMP1 + SUBQ $1, TMP1 + NEGQ TMP1 + IMULQ INC_X, TMP1 + CMPQ INC_X, $0 + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR + MOVQ X_PTR, X + + SHLQ $3, INC_X + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + + CMPQ incY+120(FP), $1 // Check for dense vector Y (fast-path) + JNE inc + + MOVSD $1.0, X0 + COMISD beta+88(FP), X0 + JE gemv_start + + MOVSD $0.0, X0 + COMISD beta+88(FP), X0 + JE gemv_clear + + MOVDDUP beta+88(FP), BETA + SHRQ $3, M + JZ scal4 + +scal8: + SCALE_8(Y_PTR, BETA) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ scal8 + +scal4: + TESTQ $4, M_DIM + JZ scal2 + SCALE_4(Y_PTR, BETA) + ADDQ $4*SIZE, Y_PTR + +scal2: + TESTQ $2, M_DIM + JZ scal1 + SCALE_2(Y_PTR, BETA) + ADDQ $2*SIZE, Y_PTR + +scal1: + TESTQ $1, M_DIM + JZ prep_end + SCALE_1(Y_PTR, BETA) + + JMP prep_end + +gemv_clear: // beta == 0 is special cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + SHRQ $3, M + JZ clear4 + +clear8: + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + MOVUPS X2, 32(Y_PTR) + MOVUPS X3, 48(Y_PTR) + ADDQ $8*SIZE, Y_PTR + DECQ M + JNZ clear8 + +clear4: + TESTQ $4, M_DIM + JZ clear2 + MOVUPS X0, (Y_PTR) + MOVUPS X1, 16(Y_PTR) + ADDQ $4*SIZE, Y_PTR + +clear2: + TESTQ $2, M_DIM + JZ clear1 + MOVUPS X0, (Y_PTR) + ADDQ $2*SIZE, Y_PTR + +clear1: + TESTQ $1, M_DIM + JZ prep_end + MOVSD X0, (Y_PTR) + +prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +gemv_start: + SHRQ $2, N + JZ c2 + +c4: + // LOAD 4 + INIT4 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c4r2 + +c4r4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c4r4 + +c4r2: + TESTQ $2, M_DIM + JZ c4r1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x4 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c4r1: + TESTQ $1, M_DIM + JZ c4end + + // 4x1 KERNEL + KERNEL_1x4 + + ADDQ $SIZE, Y_PTR + +c4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ N + JNZ c4 + +c2: + TESTQ $2, N_DIM + JZ c1 + + // LOAD 2 + INIT2 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c2r2 + +c2r4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x2 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c2r4 + +c2r2: + TESTQ $2, M_DIM + JZ c2r1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c2r1: + TESTQ $1, M_DIM + JZ c2end + + // 2x1 KERNEL + KERNEL_1x2 + + ADDQ $SIZE, Y_PTR + +c2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +c1: + TESTQ $1, N_DIM + JZ end + + // LOAD 1 + INIT1 + + MOVQ M_DIM, M + SHRQ $2, M + JZ c1r2 + +c1r4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x1 + STORE4 + + ADDQ $4*SIZE, Y_PTR + + DECQ M + JNZ c1r4 + +c1r2: + TESTQ $2, M_DIM + JZ c1r1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x1 + STORE2 + + ADDQ $2*SIZE, Y_PTR + +c1r1: + TESTQ $1, M_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + +end: + RET + +inc: // Algorithm for incY != 1 ( split loads in kernel ) + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + MOVQ Y_PTR, Y + + SHLQ $3, INC_Y + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + MOVSD $1.0, X0 + COMISD beta+88(FP), X0 + JE inc_gemv_start + + MOVSD $0.0, X0 + COMISD beta+88(FP), X0 + JE inc_gemv_clear + + MOVDDUP beta+88(FP), BETA + SHRQ $2, M + JZ inc_scal2 + +inc_scal4: + SCALEINC_4(Y_PTR, INC_Y, INC3_Y, BETA) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M +
JNZ inc_scal4 + +inc_scal2: + TESTQ $2, M_DIM + JZ inc_scal1 + + SCALEINC_2(Y_PTR, INC_Y, BETA) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_scal1: + TESTQ $1, M_DIM + JZ inc_prep_end + SCALE_1(Y_PTR, BETA) + + JMP inc_prep_end + +inc_gemv_clear: // beta == 0 is special-cased to clear memory (no nan handling) + XORPS X0, X0 + XORPS X1, X1 + XORPS X2, X2 + XORPS X3, X3 + + SHRQ $2, M + JZ inc_clear2 + +inc_clear4: + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + MOVSD X2, (Y_PTR)(INC_Y*2) + MOVSD X3, (Y_PTR)(INC3_Y*1) + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_clear4 + +inc_clear2: + TESTQ $2, M_DIM + JZ inc_clear1 + MOVSD X0, (Y_PTR) + MOVSD X1, (Y_PTR)(INC_Y*1) + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_clear1: + TESTQ $1, M_DIM + JZ inc_prep_end + MOVSD X0, (Y_PTR) + +inc_prep_end: + MOVQ Y, Y_PTR + MOVQ M_DIM, M + +inc_gemv_start: + SHRQ $2, N + JZ inc_c2 + +inc_c4: + // LOAD 4 + INIT4 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c4r2 + +inc_c4r4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + + DECQ M + JNZ inc_c4r4 + +inc_c4r2: + TESTQ $2, M_DIM + JZ inc_c4r1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x4 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c4r1: + TESTQ $1, M_DIM + JZ inc_c4end + + // 4x1 KERNEL + KERNEL_1x4 + + ADDQ INC_Y, Y_PTR + +inc_c4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ N + JNZ inc_c4 + +inc_c2: + TESTQ $2, N_DIM + JZ inc_c1 + + // LOAD 2 + INIT2 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c2r2 + +inc_c2r4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x2 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_c2r4 + +inc_c2r2: + TESTQ $2, M_DIM + JZ inc_c2r1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c2r1: + TESTQ $1, M_DIM + JZ inc_c2end + + // 2x1 KERNEL + KERNEL_1x2 + + ADDQ INC_Y, Y_PTR + +inc_c2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_c1: + TESTQ $1, N_DIM + JZ inc_end + + // LOAD 1 + INIT1 + + MOVQ M_DIM, M + SHRQ $2, M + JZ inc_c1r2 + +inc_c1r4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x1 + STORE4_INC + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ M + JNZ inc_c1r4 + +inc_c1r2: + TESTQ $2, M_DIM + JZ inc_c1r1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x1 + STORE2_INC + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_c1r1: + TESTQ $1, M_DIM + JZ inc_end + + // 1x1 KERNEL + KERNEL_1x1 + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s new file mode 100644 index 00000000..8cae5691 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/ger_amd64.s @@ -0,0 +1,591 @@ +// Copyright ©2017 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
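+// +// Ger computes the rank-one update A += alpha * x * yᵀ, processing A in +// 4×4 blocks with separate code paths for unit and non-unit incY.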
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SIZE 8 + +#define M_DIM m+0(FP) +#define M CX +#define N_DIM n+8(FP) +#define N BX + +#define TMP1 R14 +#define TMP2 R15 + +#define X_PTR SI +#define Y y_base+56(FP) +#define Y_PTR DX +#define A_ROW AX +#define A_PTR DI + +#define INC_X R8 +#define INC3_X R9 + +#define INC_Y R10 +#define INC3_Y R11 + +#define LDA R12 +#define LDA3 R13 + +#define ALPHA X0 + +#define LOAD4 \ + PREFETCHNTA (X_PTR )(INC_X*8) \ + MOVDDUP (X_PTR), X1 \ + MOVDDUP (X_PTR)(INC_X*1), X2 \ + MOVDDUP (X_PTR)(INC_X*2), X3 \ + MOVDDUP (X_PTR)(INC3_X*1), X4 \ + MULPD ALPHA, X1 \ + MULPD ALPHA, X2 \ + MULPD ALPHA, X3 \ + MULPD ALPHA, X4 + +#define LOAD2 \ + MOVDDUP (X_PTR), X1 \ + MOVDDUP (X_PTR)(INC_X*1), X2 \ + MULPD ALPHA, X1 \ + MULPD ALPHA, X2 + +#define LOAD1 \ + MOVDDUP (X_PTR), X1 \ + MULPD ALPHA, X1 + +#define KERNEL_LOAD4 \ + MOVUPS (Y_PTR), X5 \ + MOVUPS 2*SIZE(Y_PTR), X6 + +#define KERNEL_LOAD4_INC \ + MOVLPD (Y_PTR), X5 \ + MOVHPD (Y_PTR)(INC_Y*1), X5 \ + MOVLPD (Y_PTR)(INC_Y*2), X6 \ + MOVHPD (Y_PTR)(INC3_Y*1), X6 + +#define KERNEL_LOAD2 \ + MOVUPS (Y_PTR), X5 + +#define KERNEL_LOAD2_INC \ + MOVLPD (Y_PTR), X5 \ + MOVHPD (Y_PTR)(INC_Y*1), X5 + +#define KERNEL_4x4 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MOVUPS X5, X9 \ + MOVUPS X6, X10 \ + MOVUPS X5, X11 \ + MOVUPS X6, X12 \ + MULPD X1, X5 \ + MULPD X1, X6 \ + MULPD X2, X7 \ + MULPD X2, X8 \ + MULPD X3, X9 \ + MULPD X3, X10 \ + MULPD X4, X11 \ + MULPD X4, X12 + +#define STORE_4x4 \ + MOVUPS (A_PTR), X13 \ + ADDPD X13, X5 \ + MOVUPS 2*SIZE(A_PTR), X14 \ + ADDPD X14, X6 \ + MOVUPS (A_PTR)(LDA*1), X15 \ + ADDPD X15, X7 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X0 \ + ADDPD X0, X8 \ + MOVUPS (A_PTR)(LDA*2), X13 \ + ADDPD X13, X9 \ + MOVUPS 2*SIZE(A_PTR)(LDA*2), X14 \ + ADDPD X14, X10 \ + MOVUPS (A_PTR)(LDA3*1), X15 \ + ADDPD X15, X11 \ + MOVUPS 2*SIZE(A_PTR)(LDA3*1), X0 \ + ADDPD X0, X12 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 2*SIZE(A_PTR)(LDA*1) \ + MOVUPS X9, (A_PTR)(LDA*2) \ + MOVUPS X10, 2*SIZE(A_PTR)(LDA*2) \ + MOVUPS X11, (A_PTR)(LDA3*1) \ + MOVUPS X12, 2*SIZE(A_PTR)(LDA3*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_4x2 \ + MOVUPS X5, X6 \ + MOVUPS X5, X7 \ + MOVUPS X5, X8 \ + MULPD X1, X5 \ + MULPD X2, X6 \ + MULPD X3, X7 \ + MULPD X4, X8 + +#define STORE_4x2 \ + MOVUPS (A_PTR), X9 \ + ADDPD X9, X5 \ + MOVUPS (A_PTR)(LDA*1), X10 \ + ADDPD X10, X6 \ + MOVUPS (A_PTR)(LDA*2), X11 \ + ADDPD X11, X7 \ + MOVUPS (A_PTR)(LDA3*1), X12 \ + ADDPD X12, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + MOVUPS X7, (A_PTR)(LDA*2) \ + MOVUPS X8, (A_PTR)(LDA3*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_4x1 \ + MOVSD (Y_PTR), X5 \ + MOVSD X5, X6 \ + MOVSD X5, X7 \ + MOVSD X5, X8 \ + MULSD X1, X5 \ + MULSD X2, X6 \ + MULSD X3, X7 \ + MULSD X4, X8 + +#define STORE_4x1 \ + ADDSD (A_PTR), X5 \ + ADDSD (A_PTR)(LDA*1), X6 \ + ADDSD (A_PTR)(LDA*2), X7 \ + ADDSD (A_PTR)(LDA3*1), X8 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + MOVSD X7, (A_PTR)(LDA*2) \ + MOVSD X8, (A_PTR)(LDA3*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_2x4 \ + MOVUPS X5, X7 \ + MOVUPS X6, X8 \ + MULPD X1, X5 \ + MULPD X1, X6 \ + MULPD X2, X7 \ + MULPD X2, X8 + +#define STORE_2x4 \ + MOVUPS (A_PTR), X9 \ + ADDPD X9, X5 \ + MOVUPS 2*SIZE(A_PTR), X10 \ + ADDPD X10, X6 \ + MOVUPS (A_PTR)(LDA*1), X11 \ + ADDPD X11, X7 \ + MOVUPS 2*SIZE(A_PTR)(LDA*1), X12 \ + ADDPD X12, X8 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + MOVUPS X7, (A_PTR)(LDA*1) \ + MOVUPS X8, 
2*SIZE(A_PTR)(LDA*1) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_2x2 \ + MOVUPS X5, X6 \ + MULPD X1, X5 \ + MULPD X2, X6 + +#define STORE_2x2 \ + MOVUPS (A_PTR), X7 \ + ADDPD X7, X5 \ + MOVUPS (A_PTR)(LDA*1), X8 \ + ADDPD X8, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, (A_PTR)(LDA*1) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_2x1 \ + MOVSD (Y_PTR), X5 \ + MOVSD X5, X6 \ + MULSD X1, X5 \ + MULSD X2, X6 + +#define STORE_2x1 \ + ADDSD (A_PTR), X5 \ + ADDSD (A_PTR)(LDA*1), X6 \ + MOVSD X5, (A_PTR) \ + MOVSD X6, (A_PTR)(LDA*1) \ + ADDQ $SIZE, A_PTR + +#define KERNEL_1x4 \ + MULPD X1, X5 \ + MULPD X1, X6 + +#define STORE_1x4 \ + MOVUPS (A_PTR), X7 \ + ADDPD X7, X5 \ + MOVUPS 2*SIZE(A_PTR), X8 \ + ADDPD X8, X6 \ + MOVUPS X5, (A_PTR) \ + MOVUPS X6, 2*SIZE(A_PTR) \ + ADDQ $4*SIZE, A_PTR + +#define KERNEL_1x2 \ + MULPD X1, X5 + +#define STORE_1x2 \ + MOVUPS (A_PTR), X6 \ + ADDPD X6, X5 \ + MOVUPS X5, (A_PTR) \ + ADDQ $2*SIZE, A_PTR + +#define KERNEL_1x1 \ + MOVSD (Y_PTR), X5 \ + MULSD X1, X5 + +#define STORE_1x1 \ + ADDSD (A_PTR), X5 \ + MOVSD X5, (A_PTR) \ + ADDQ $SIZE, A_PTR + +// func Ger(m, n uintptr, alpha float64, +// x []float64, incX uintptr, +// y []float64, incY uintptr, +// a []float64, lda uintptr) +TEXT ·Ger(SB), NOSPLIT, $0 + MOVQ M_DIM, M + MOVQ N_DIM, N + CMPQ M, $0 + JE end + CMPQ N, $0 + JE end + + MOVDDUP alpha+16(FP), ALPHA + + MOVQ x_base+24(FP), X_PTR + MOVQ y_base+56(FP), Y_PTR + MOVQ a_base+88(FP), A_ROW + MOVQ incX+48(FP), INC_X // INC_X = incX * sizeof(float64) + SHLQ $3, INC_X + MOVQ lda+112(FP), LDA // LDA = LDA * sizeof(float64) + SHLQ $3, LDA + LEAQ (LDA)(LDA*2), LDA3 // LDA3 = LDA * 3 + LEAQ (INC_X)(INC_X*2), INC3_X // INC3_X = INC_X * 3 + MOVQ A_ROW, A_PTR + + XORQ TMP2, TMP2 + MOVQ M, TMP1 + SUBQ $1, TMP1 + IMULQ INC_X, TMP1 + NEGQ TMP1 + CMPQ INC_X, $0 + CMOVQLT TMP1, TMP2 + LEAQ (X_PTR)(TMP2*SIZE), X_PTR + + CMPQ incY+80(FP), $1 // Check for dense vector Y (fast-path) + JG inc + JL end + + SHRQ $2, M + JZ r2 + +r4: + // LOAD 4 + LOAD4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r4c2 + +r4c4: + // 4x4 KERNEL + KERNEL_LOAD4 + KERNEL_4x4 + STORE_4x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r4c4 + + // Reload ALPHA after it's clobbered by STORE_4x4 + MOVDDUP alpha+16(FP), ALPHA + +r4c2: + TESTQ $2, N_DIM + JZ r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2 + KERNEL_4x2 + STORE_4x2 + + ADDQ $2*SIZE, Y_PTR + +r4c1: + TESTQ $1, N_DIM + JZ r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ $SIZE, Y_PTR + +r4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ r4 + +r2: + TESTQ $2, M_DIM + JZ r1 + + // LOAD 2 + LOAD2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r2c2 + +r2c4: + // 2x4 KERNEL + KERNEL_LOAD4 + KERNEL_2x4 + STORE_2x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r2c4 + +r2c2: + TESTQ $2, N_DIM + JZ r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2 + KERNEL_2x2 + STORE_2x2 + + ADDQ $2*SIZE, Y_PTR + +r2c1: + TESTQ $1, N_DIM + JZ r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ $SIZE, Y_PTR + +r2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ r1c2 + +r1c4: + // 1x4 KERNEL + KERNEL_LOAD4 + KERNEL_1x4 + STORE_1x4 + + ADDQ $4*SIZE, Y_PTR + + DECQ N + JNZ r1c4 + +r1c2: + TESTQ $2, N_DIM + JZ r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2 + KERNEL_1x2 + STORE_1x2 + + ADDQ $2*SIZE, Y_PTR + +r1c1: + TESTQ $1, N_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + + ADDQ $SIZE, Y_PTR + +end: + RET + 
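+// The inc path below handles incY != 1: elements of y are gathered with
+// strided MOVLPD/MOVHPD loads (KERNEL_LOAD*_INC), Y_PTR advances by INC_Y,
+// and the preamble adjusts Y_PTR for a negative INC_Y; the A-matrix
+// kernels and stores are unchanged.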
+inc: // Algorithm for incY != 1 ( split loads in kernel ) + + MOVQ incY+80(FP), INC_Y // INC_Y = incY * sizeof(float64) + SHLQ $3, INC_Y + LEAQ (INC_Y)(INC_Y*2), INC3_Y // INC3_Y = INC_Y * 3 + + XORQ TMP2, TMP2 + MOVQ N, TMP1 + SUBQ $1, TMP1 + IMULQ INC_Y, TMP1 + NEGQ TMP1 + CMPQ INC_Y, $0 + CMOVQLT TMP1, TMP2 + LEAQ (Y_PTR)(TMP2*SIZE), Y_PTR + + SHRQ $2, M + JZ inc_r2 + +inc_r4: + // LOAD 4 + LOAD4 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r4c2 + +inc_r4c4: + // 4x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_4x4 + STORE_4x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r4c4 + + // Reload ALPHA after it's clobbered by STORE_4x4 + MOVDDUP alpha+16(FP), ALPHA + +inc_r4c2: + TESTQ $2, N_DIM + JZ inc_r4c1 + + // 4x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_4x2 + STORE_4x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r4c1: + TESTQ $1, N_DIM + JZ inc_r4end + + // 4x1 KERNEL + KERNEL_4x1 + STORE_4x1 + + ADDQ INC_Y, Y_PTR + +inc_r4end: + LEAQ (X_PTR)(INC_X*4), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*4), A_ROW + MOVQ A_ROW, A_PTR + + DECQ M + JNZ inc_r4 + +inc_r2: + TESTQ $2, M_DIM + JZ inc_r1 + + // LOAD 2 + LOAD2 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r2c2 + +inc_r2c4: + // 2x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_2x4 + STORE_2x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r2c4 + +inc_r2c2: + TESTQ $2, N_DIM + JZ inc_r2c1 + + // 2x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_2x2 + STORE_2x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r2c1: + TESTQ $1, N_DIM + JZ inc_r2end + + // 2x1 KERNEL + KERNEL_2x1 + STORE_2x1 + + ADDQ INC_Y, Y_PTR + +inc_r2end: + LEAQ (X_PTR)(INC_X*2), X_PTR + MOVQ Y, Y_PTR + LEAQ (A_ROW)(LDA*2), A_ROW + MOVQ A_ROW, A_PTR + +inc_r1: + TESTQ $1, M_DIM + JZ end + + // LOAD 1 + LOAD1 + + MOVQ N_DIM, N + SHRQ $2, N + JZ inc_r1c2 + +inc_r1c4: + // 1x4 KERNEL + KERNEL_LOAD4_INC + KERNEL_1x4 + STORE_1x4 + + LEAQ (Y_PTR)(INC_Y*4), Y_PTR + DECQ N + JNZ inc_r1c4 + +inc_r1c2: + TESTQ $2, N_DIM + JZ inc_r1c1 + + // 1x2 KERNEL + KERNEL_LOAD2_INC + KERNEL_1x2 + STORE_1x2 + + LEAQ (Y_PTR)(INC_Y*2), Y_PTR + +inc_r1c1: + TESTQ $1, N_DIM + JZ end + + // 1x1 KERNEL + KERNEL_1x1 + STORE_1x1 + + ADDQ INC_Y, Y_PTR + +inc_end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s new file mode 100644 index 00000000..b4b1fd02 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l1norm_amd64.s @@ -0,0 +1,58 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func L1Dist(s, t []float64) float64 +TEXT ·L1Dist(SB), NOSPLIT, $0 + MOVQ s_base+0(FP), DI // DI = &s + MOVQ t_base+24(FP), SI // SI = &t + MOVQ s_len+8(FP), CX // CX = len(s) + CMPQ t_len+32(FP), CX // CX = max( CX, len(t) ) + CMOVQLE t_len+32(FP), CX + PXOR X3, X3 // norm = 0 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE l1_end + XORQ AX, AX // i = 0 + MOVQ CX, BX + ANDQ $1, BX // BX = CX % 2 + SHRQ $1, CX // CX = floor( CX / 2 ) + JZ l1_tail_start // if CX == 0 { return 0 } + +l1_loop: // Loop unrolled 2x do { + MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] + MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] + MOVAPS X0, X2 + SUBPD X1, X0 + SUBPD X2, X1 + MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + ADDPD X0, X3 // norm += X0 + ADDQ $2, AX // i += 2 + LOOP l1_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE l1_end + +l1_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + PXOR X0, X0 // reset X0, X1 to break dependencies + PXOR X1, X1 + +l1_tail: + MOVSD (SI)(AX*8), X0 // X0 = t[i] + MOVSD (DI)(AX*8), X1 // x1 = s[i] + MOVAPD X0, X2 + SUBSD X1, X0 + SUBSD X2, X1 + MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + ADDSD X0, X3 // norm += X0 + +l1_end: + MOVAPS X3, X2 + SHUFPD $1, X2, X2 + ADDSD X3, X2 // X2 = X3[1] + X3[0] + MOVSD X2, ret+48(FP) // return X2 + RET + diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s new file mode 100644 index 00000000..86e01c87 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_amd64.s @@ -0,0 +1,109 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SUMSQ X0 +#define ABSX X1 +#define SCALE X2 +#define ZERO X3 +#define TMP X4 +#define ABSMASK X5 +#define INF X7 +#define INFMASK X11 +#define NANMASK X12 +#define IDX AX +#define LEN SI +#define X_ DI + +#define ABSMASK_DATA l2nrodata<>+0(SB) +#define INF_DATA l2nrodata<>+8(SB) +#define NAN_DATA l2nrodata<>+16(SB) +// AbsMask +DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF +// Inf +DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000 +// NaN +DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000 +GLOBL l2nrodata<>+0(SB), RODATA, $24 + +// L2NormUnitary returns the L2-norm of x. 
+// func L2NormUnitary(x []float64) (norm float64) +TEXT ·L2NormUnitary(SB), NOSPLIT, $0 + MOVQ x_len+8(FP), LEN // LEN = len(x) + MOVQ x_base+0(FP), X_ + PXOR ZERO, ZERO + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JZ retZero + + PXOR INFMASK, INFMASK + PXOR NANMASK, NANMASK + MOVSD $1.0, SUMSQ // ssq = 1 + XORPS SCALE, SCALE + MOVSD ABSMASK_DATA, ABSMASK + MOVSD INF_DATA, INF + XORQ IDX, IDX // idx == 0 + +initZero: // for ;x[i]==0; i++ {} + // Skip all leading zeros, to avoid divide by zero NaN + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + UCOMISD ABSX, ZERO + JP retNaN // if isNaN(x[i]) { return NaN } + JNE loop // if x[i] != 0 { goto loop } + INCQ IDX // i++ + CMPQ IDX, LEN + JE retZero // if i == LEN { return 0 } + JMP initZero + +loop: + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + MOVUPS ABSX, TMP + CMPSD ABSX, TMP, $3 + ORPD TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi) + MOVSD INF, TMP + ANDPD ABSMASK, ABSX // absxi == Abs(absxi) + CMPSD ABSX, TMP, $0 + ORPD TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi) + UCOMISD SCALE, ABSX + JA adjScale // IF SCALE > ABSXI { goto adjScale } + + DIVSD SCALE, ABSX // absxi = scale / absxi + MULSD ABSX, ABSX // absxi *= absxi + ADDSD ABSX, SUMSQ // sumsq += absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + JMP retSum // if i == LEN { goto retSum } + +adjScale: // Scale > Absxi + DIVSD ABSX, SCALE // tmp = absxi / scale + MULSD SCALE, SUMSQ // sumsq *= tmp + MULSD SCALE, SUMSQ // sumsq *= tmp + ADDSD $1.0, SUMSQ // sumsq += 1 + MOVUPS ABSX, SCALE // scale = absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + +retSum: // Calculate return value + SQRTSD SUMSQ, SUMSQ // sumsq = sqrt(sumsq) + MULSD SCALE, SUMSQ // sumsq += scale + MOVQ SUMSQ, R10 // tmp = sumsq + UCOMISD ZERO, INFMASK + CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF } + UCOMISD ZERO, NANMASK + CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN } + MOVQ R10, norm+24(FP) // return tmp + RET + +retZero: + MOVSD ZERO, norm+24(FP) // return 0 + RET + +retNaN: + MOVSD NAN_DATA, TMP // return NaN + MOVSD TMP, norm+24(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go new file mode 100644 index 00000000..bfb8fba9 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norm_noasm.go @@ -0,0 +1,93 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +import "math" + +// L2NormUnitary returns the L2-norm of x. +func L2NormUnitary(x []float64) (norm float64) { + var scale float64 + sumSquares := 1.0 + for _, v := range x { + if v == 0 { + continue + } + absxi := math.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// L2NormInc returns the L2-norm of x. 
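+// Like L2NormUnitary above, it keeps scale = max|v| seen so far and
+// sumSquares = Σ (v/scale)², so the final scale*math.Sqrt(sumSquares)
+// is formed without intermediate overflow or underflow.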
+func L2NormInc(x []float64, n, incX uintptr) (norm float64) { + var scale float64 + sumSquares := 1.0 + for ix := uintptr(0); ix < n*incX; ix += incX { + val := x[ix] + if val == 0 { + continue + } + absxi := math.Abs(val) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} + +// L2DistanceUnitary returns the L2-norm of x-y. +func L2DistanceUnitary(x, y []float64) (norm float64) { + var scale float64 + sumSquares := 1.0 + for i, v := range x { + v -= y[i] + if v == 0 { + continue + } + absxi := math.Abs(v) + if math.IsNaN(absxi) { + return math.NaN() + } + if scale < absxi { + s := scale / absxi + sumSquares = 1 + sumSquares*s*s + scale = absxi + } else { + s := absxi / scale + sumSquares += s * s + } + } + if math.IsInf(scale, 1) { + return math.Inf(1) + } + return scale * math.Sqrt(sumSquares) +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s new file mode 100644 index 00000000..10dcae40 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2normdist_amd64.s @@ -0,0 +1,115 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SUMSQ X0 +#define ABSX X1 +#define SCALE X2 +#define ZERO X3 +#define TMP X4 +#define ABSMASK X5 +#define INF X7 +#define INFMASK X11 +#define NANMASK X12 +#define IDX AX +#define X_ DI +#define Y_ BX +#define LEN SI + +#define ABSMASK_DATA l2nrodata<>+0(SB) +#define INF_DATA l2nrodata<>+8(SB) +#define NAN_DATA l2nrodata<>+16(SB) +// AbsMask +DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF +// Inf +DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000 +// NaN +DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000 +GLOBL l2nrodata<>+0(SB), RODATA, $24 + +// L2DistanceUnitary returns the L2-norm of x-y. 
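+// It applies the same scaled algorithm as L2NormUnitary, but each element
+// is formed in-register as x[i]-y[i] (SUBSD) before the abs/scale steps,
+// so no temporary difference slice is needed.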
+// func L2DistanceUnitary(x,y []float64) (norm float64) +TEXT ·L2DistanceUnitary(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_ + MOVQ y_base+24(FP), Y_ + PXOR ZERO, ZERO + MOVQ x_len+8(FP), LEN // LEN = min( len(x), len(y) ) + CMPQ y_len+32(FP), LEN + CMOVQLE y_len+32(FP), LEN + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JZ retZero + + PXOR INFMASK, INFMASK + PXOR NANMASK, NANMASK + MOVSD $1.0, SUMSQ // ssq = 1 + XORPS SCALE, SCALE + MOVSD ABSMASK_DATA, ABSMASK + MOVSD INF_DATA, INF + XORQ IDX, IDX // idx == 0 + +initZero: // for ;x[i]==0; i++ {} + // Skip all leading zeros, to avoid divide by zero NaN + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + SUBSD (Y_)(IDX*8), ABSX // absxi = x[i]-y[i] + UCOMISD ABSX, ZERO + JP retNaN // if isNaN(absxi) { return NaN } + JNE loop // if absxi != 0 { goto loop } + INCQ IDX // i++ + CMPQ IDX, LEN + JE retZero // if i == LEN { return 0 } + JMP initZero + +loop: + MOVSD (X_)(IDX*8), ABSX // absxi = x[i] + SUBSD (Y_)(IDX*8), ABSX // absxi = x[i]-y[i] + MOVUPS ABSX, TMP + CMPSD ABSX, TMP, $3 + ORPD TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi) + MOVSD INF, TMP + ANDPD ABSMASK, ABSX // absxi == Abs(absxi) + CMPSD ABSX, TMP, $0 + ORPD TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi) + UCOMISD SCALE, ABSX + JA adjScale // IF SCALE > ABSXI { goto adjScale } + + DIVSD SCALE, ABSX // absxi = scale / absxi + MULSD ABSX, ABSX // absxi *= absxi + ADDSD ABSX, SUMSQ // sumsq += absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + JMP retSum // if i == LEN { goto retSum } + +adjScale: // Scale > Absxi + DIVSD ABSX, SCALE // tmp = absxi / scale + MULSD SCALE, SUMSQ // sumsq *= tmp + MULSD SCALE, SUMSQ // sumsq *= tmp + ADDSD $1.0, SUMSQ // sumsq += 1 + MOVUPS ABSX, SCALE // scale = absxi + INCQ IDX // i++ + CMPQ IDX, LEN + JNE loop // if i < LEN { continue } + +retSum: // Calculate return value + SQRTSD SUMSQ, SUMSQ // sumsq = sqrt(sumsq) + MULSD SCALE, SUMSQ // sumsq += scale + MOVQ SUMSQ, R10 // tmp = sumsq + UCOMISD ZERO, INFMASK + CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF } + UCOMISD ZERO, NANMASK + CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN } + MOVQ R10, norm+48(FP) // return tmp + RET + +retZero: + MOVSD ZERO, norm+48(FP) // return 0 + RET + +retNaN: + MOVSD NAN_DATA, TMP // return NaN + MOVSD TMP, norm+48(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s new file mode 100644 index 00000000..8341db93 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/l2norminc_amd64.s @@ -0,0 +1,110 @@ +// Copyright ©2019 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
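+
+// L2NormInc is the strided variant of L2NormUnitary: INC is scaled to
+// bytes once (SHLQ $3) and each step advances X_ by INC while LEN counts
+// down to zero.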
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define SUMSQ X0 +#define ABSX X1 +#define SCALE X2 +#define ZERO X3 +#define TMP X4 +#define ABSMASK X5 +#define INF X7 +#define INFMASK X11 +#define NANMASK X12 +#define IDX AX +#define LEN SI +#define INC BX +#define X_ DI + +#define ABSMASK_DATA l2nrodata<>+0(SB) +#define INF_DATA l2nrodata<>+8(SB) +#define NAN_DATA l2nrodata<>+16(SB) +// AbsMask +DATA l2nrodata<>+0(SB)/8, $0x7FFFFFFFFFFFFFFF +// Inf +DATA l2nrodata<>+8(SB)/8, $0x7FF0000000000000 +// NaN +DATA l2nrodata<>+16(SB)/8, $0xFFF8000000000000 +GLOBL l2nrodata<>+0(SB), RODATA, $24 + +// func L2NormInc(x []float64, n, incX uintptr) (norm float64) +TEXT ·L2NormInc(SB), NOSPLIT, $0 + MOVQ n+24(FP), LEN // LEN = len(x) + MOVQ incX+32(FP), INC + MOVQ x_base+0(FP), X_ + XORPS ZERO, ZERO + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JZ retZero + + XORPS INFMASK, INFMASK + XORPS NANMASK, NANMASK + MOVSD $1.0, SUMSQ // ssq = 1 + XORPS SCALE, SCALE + MOVSD ABSMASK_DATA, ABSMASK + MOVSD INF_DATA, INF + SHLQ $3, INC // INC *= sizeof(float64) + +initZero: // for ;x[i]==0; i++ {} + // Skip all leading zeros, to avoid divide by zero NaN + MOVSD (X_), ABSX // absxi = x[i] + UCOMISD ABSX, ZERO + JP retNaN // if isNaN(x[i]) { return NaN } + JNZ loop // if x[i] != 0 { goto loop } + ADDQ INC, X_ // i += INC + DECQ LEN // LEN-- + JZ retZero // if LEN == 0 { return 0 } + JMP initZero + +loop: + MOVSD (X_), ABSX // absxi = x[i] + MOVUPS ABSX, TMP + CMPSD ABSX, TMP, $3 + ORPD TMP, NANMASK // NANMASK = NANMASK | IsNaN(absxi) + MOVSD INF, TMP + ANDPD ABSMASK, ABSX // absxi == Abs(absxi) + CMPSD ABSX, TMP, $0 + ORPD TMP, INFMASK // INFMASK = INFMASK | IsInf(absxi) + UCOMISD SCALE, ABSX + JA adjScale // IF SCALE > ABSXI { goto adjScale } + + DIVSD SCALE, ABSX // absxi = scale / absxi + MULSD ABSX, ABSX // absxi *= absxi + ADDSD ABSX, SUMSQ // sumsq += absxi + ADDQ INC, X_ // i += INC + DECQ LEN // LEN-- + JNZ loop // if LEN > 0 { continue } + JMP retSum // if LEN == 0 { goto retSum } + +adjScale: // Scale > Absxi + DIVSD ABSX, SCALE // tmp = absxi / scale + MULSD SCALE, SUMSQ // sumsq *= tmp + MULSD SCALE, SUMSQ // sumsq *= tmp + ADDSD $1.0, SUMSQ // sumsq += 1 + MOVUPS ABSX, SCALE // scale = absxi + ADDQ INC, X_ // i += INC + DECQ LEN // LEN-- + JNZ loop // if LEN > 0 { continue } + +retSum: // Calculate return value + SQRTSD SUMSQ, SUMSQ // sumsq = sqrt(sumsq) + MULSD SCALE, SUMSQ // sumsq += scale + MOVQ SUMSQ, R10 // tmp = sumsq + UCOMISD ZERO, INFMASK + CMOVQPS INF_DATA, R10 // if INFMASK { tmp = INF } + UCOMISD ZERO, NANMASK + CMOVQPS NAN_DATA, R10 // if NANMASK { tmp = NaN } + MOVQ R10, norm+40(FP) // return tmp + RET + +retZero: + MOVSD ZERO, norm+40(FP) // return 0 + RET + +retNaN: + MOVSD NAN_DATA, TMP // return NaN + MOVSD TMP, norm+40(FP) + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s new file mode 100644 index 00000000..ac18b481 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/linfnorm_amd64.s @@ -0,0 +1,57 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. 
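+
+// LinfDist computes max_i |s[i]-t[i]|: the unrolled loop keeps pairwise
+// maxima in X3 via MAXPD, and the epilogue folds the two lanes with
+// SHUFPD + MAXSD.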
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +// func LinfDist(s, t []float64) float64 +TEXT ·LinfDist(SB), NOSPLIT, $0 + MOVQ s_base+0(FP), DI // DI = &s + MOVQ t_base+24(FP), SI // SI = &t + MOVQ s_len+8(FP), CX // CX = len(s) + CMPQ t_len+32(FP), CX // CX = max( CX, len(t) ) + CMOVQLE t_len+32(FP), CX + PXOR X3, X3 // norm = 0 + CMPQ CX, $0 // if CX == 0 { return 0 } + JE l1_end + XORQ AX, AX // i = 0 + MOVQ CX, BX + ANDQ $1, BX // BX = CX % 2 + SHRQ $1, CX // CX = floor( CX / 2 ) + JZ l1_tail_start // if CX == 0 { return 0 } + +l1_loop: // Loop unrolled 2x do { + MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] + MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] + MOVAPS X0, X2 + SUBPD X1, X0 + SUBPD X2, X1 + MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + MAXPD X0, X3 // norm = max( norm, X0 ) + ADDQ $2, AX // i += 2 + LOOP l1_loop // } while --CX > 0 + CMPQ BX, $0 // if BX == 0 { return } + JE l1_end + +l1_tail_start: // Reset loop registers + MOVQ BX, CX // Loop counter: CX = BX + PXOR X0, X0 // reset X0, X1 to break dependencies + PXOR X1, X1 + +l1_tail: + MOVSD (SI)(AX*8), X0 // X0 = t[i] + MOVSD (DI)(AX*8), X1 // X1 = s[i] + MOVAPD X0, X2 + SUBSD X1, X0 + SUBSD X2, X1 + MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) + MAXSD X0, X3 // norm = max( norm, X0 ) + +l1_end: + MOVAPS X3, X2 + SHUFPD $1, X2, X2 + MAXSD X3, X2 // X2 = max( X3[1], X3[0] ) + MOVSD X2, ret+48(FP) // return X2 + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go new file mode 100644 index 00000000..c95219e1 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scal.go @@ -0,0 +1,62 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha float64, x []float64) { + for i := range x { + x[i] *= alpha + } +} + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []float64, alpha float64, x []float64) { + for i, v := range x { + dst[i] = alpha * v + } +} + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha float64, x []float64, n, incX uintptr) { + var ix uintptr + for i := 0; i < int(n); i++ { + x[ix] *= alpha + ix += incX + } +} + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) { + var idst, ix uintptr + for i := 0; i < int(n); i++ { + dst[idst] = alpha * x[ix] + ix += incX + idst += incDst + } +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s new file mode 100644 index 00000000..d623a284 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalinc_amd64.s @@ -0,0 +1,113 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. 
All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R9 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalInc(alpha float64, x []float64, n, incX uintptr) +TEXT ·ScalInc(SB), NOSPLIT, $0 + MOVSD alpha+0(FP), ALPHA // ALPHA = alpha + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ incX+40(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + MOVQ n+32(FP), LEN // LEN = n + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + +loop: // do { // x[i] *= alpha unrolled 4x. 
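+	// Four strided elements per pass: INC_X is already scaled to bytes and
+	// INCx3_X = 3*INC_X reaches the fourth element in one addressing mode.
+	// ALPHA_2 duplicates ALPHA so alternating MULSDs avoid serializing on a
+	// single register.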
+ MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + MOVSD X4, (X_PTR)(INC_X*2) + MOVSD X5, (X_PTR)(INCx3_X*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: // do { + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (X_PTR) // x[i] = X_i + MOVSD X3, (X_PTR)(INC_X*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (X_PTR) // x[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s new file mode 100644 index 00000000..1c272209 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalincto_amd64.s @@ -0,0 +1,122 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
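+
+// ScalIncTo is the strided-destination form of ScalInc: INC_DST/INCx3_DST
+// mirror the INC_X/INCx3_X strides so dst[i] = alpha * x[i] is written
+// without modifying x.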
+ +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define DST_PTR DI +#define LEN CX +#define TAIL BX +#define INC_X R8 +#define INCx3_X R9 +#define INC_DST R10 +#define INCx3_DST R11 +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) +TEXT ·ScalIncTo(SB), NOSPLIT, $0 + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVQ incDst+24(FP), INC_DST // INC_DST = incDst + SHLQ $3, INC_DST // INC_DST *= sizeof(float64) + MOVSD alpha+32(FP), ALPHA // ALPHA = alpha + MOVQ x_base+40(FP), X_PTR // X_PTR = &x + MOVQ n+64(FP), LEN // LEN = n + MOVQ incX+72(FP), INC_X // INC_X = incX + SHLQ $3, INC_X // INC_X *= sizeof(float64) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + MOVQ LEN, TAIL + ANDQ $3, TAIL // TAIL = LEN % 4 + SHRQ $2, LEN // LEN = floor( LEN / 4 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 + LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 + +loop: // do { // x[i] *= alpha unrolled 4x. + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MOVSD (X_PTR)(INC_X*2), X4 + MOVSD (X_PTR)(INCx3_X*1), X5 + + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA_2, X3 + MULSD ALPHA, X4 + MULSD ALPHA_2, X5 + + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + MOVSD X4, (DST_PTR)(INC_DST*2) + MOVSD X5, (DST_PTR)(INCx3_DST*1) + + LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) + LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) + DECQ LEN + JNZ loop // } while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( LEN / 2 ) + JZ tail_one + +tail_two: + MOVSD (X_PTR), X2 // X_i = x[i] + MOVSD (X_PTR)(INC_X*1), X3 + MULSD ALPHA, X2 // X_i *= a + MULSD ALPHA, X3 + MOVSD X2, (DST_PTR) // dst[i] = X_i + MOVSD X3, (DST_PTR)(INC_DST*1) + + LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) + LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) + + ANDQ $1, TAIL + JZ end + +tail_one: + MOVSD (X_PTR), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR) // x[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s new file mode 100644 index 00000000..6e8f5ca6 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitary_amd64.s @@ -0,0 +1,112 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP] + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalUnitary(alpha float64, x []float64) +TEXT ·ScalUnitary(SB), NOSPLIT, $0 + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_base+8(FP), X_PTR // X_PTR = &x + MOVQ x_len+16(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + XORQ IDX, IDX // IDX = 0 + + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 + +loop: // do { // x[i] *= alpha unrolled 8x. + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + MOVUPS X3, 16(X_PTR)(IDX*8) + MOVUPS X4, 32(X_PTR)(IDX*8) + MOVUPS X5, 48(X_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop registers + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if n == 0 goto end + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + // x[i] *= alpha for the remaining element. + MOVSD (X_PTR)(IDX*8), X2 + MULSD ALPHA, X2 + MOVSD X2, (X_PTR)(IDX*8) + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s new file mode 100644 index 00000000..986480a5 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/scalunitaryto_amd64.s @@ -0,0 +1,113 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +// +// Some of the loop unrolling code is copied from: +// http://golang.org/src/math/big/arith_amd64.s +// which is distributed under these terms: +// +// Copyright (c) 2012 The Go Authors. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/ + +#define X_PTR SI +#define DST_PTR DI +#define IDX AX +#define LEN CX +#define TAIL BX +#define ALPHA X0 +#define ALPHA_2 X1 + +// func ScalUnitaryTo(dst []float64, alpha float64, x []float64) +// This function assumes len(dst) >= len(x). +TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 + MOVQ x_base+32(FP), X_PTR // X_PTR = &x + MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst + MOVDDUP_ALPHA // ALPHA = { alpha, alpha } + MOVQ x_len+40(FP), LEN // LEN = len(x) + CMPQ LEN, $0 + JE end // if LEN == 0 { return } + + XORQ IDX, IDX // IDX = 0 + MOVQ LEN, TAIL + ANDQ $7, TAIL // TAIL = LEN % 8 + SHRQ $3, LEN // LEN = floor( LEN / 8 ) + JZ tail_start // if LEN == 0 { goto tail_start } + + MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining + +loop: // do { // dst[i] = alpha * x[i] unrolled 8x. 
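+	// Both slices are unit stride, so a single index register (IDX) addresses
+	// x and dst; four MOVUPS/MULPD pairs process eight float64s per pass.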
+ MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MOVUPS 16(X_PTR)(IDX*8), X3 + MOVUPS 32(X_PTR)(IDX*8), X4 + MOVUPS 48(X_PTR)(IDX*8), X5 + + MULPD ALPHA, X2 // X_i *= ALPHA + MULPD ALPHA_2, X3 + MULPD ALPHA, X4 + MULPD ALPHA_2, X5 + + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + MOVUPS X3, 16(DST_PTR)(IDX*8) + MOVUPS X4, 32(DST_PTR)(IDX*8) + MOVUPS X5, 48(DST_PTR)(IDX*8) + + ADDQ $8, IDX // i += 8 + DECQ LEN + JNZ loop // while --LEN > 0 + CMPQ TAIL, $0 + JE end // if TAIL == 0 { return } + +tail_start: // Reset loop counters + MOVQ TAIL, LEN // Loop counter: LEN = TAIL + SHRQ $1, LEN // LEN = floor( TAIL / 2 ) + JZ tail_one // if LEN == 0 { goto tail_one } + +tail_two: // do { + MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] + MULPD ALPHA, X2 // X_i *= ALPHA + MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i + ADDQ $2, IDX // i += 2 + DECQ LEN + JNZ tail_two // while --LEN > 0 + + ANDQ $1, TAIL + JZ end // if TAIL == 0 { return } + +tail_one: + MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i] + MULSD ALPHA, X2 // X_i *= ALPHA + MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i + +end: + RET diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go new file mode 100644 index 00000000..7139bedd --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_amd64.go @@ -0,0 +1,277 @@ +// Copyright ©2015 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !noasm && !gccgo && !safe +// +build !noasm,!gccgo,!safe + +package f64 + +// L1Norm is +// +// for _, v := range x { +// sum += math.Abs(v) +// } +// return sum +func L1Norm(x []float64) (sum float64) + +// L1NormInc is +// +// for i := 0; i < n*incX; i += incX { +// sum += math.Abs(x[i]) +// } +// return sum +func L1NormInc(x []float64, n, incX int) (sum float64) + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha float64, x []float64) + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []float64) + +// AxpyUnitary is +// +// for i, v := range x { +// y[i] += alpha * v +// } +func AxpyUnitary(alpha float64, x, y []float64) + +// AxpyUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha*v + y[i] +// } +func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) + +// AxpyInc is +// +// for i := 0; i < int(n); i++ { +// y[iy] += alpha * x[ix] +// ix += incX +// iy += incY +// } +func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) + +// AxpyIncTo is +// +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha*x[ix] + y[iy] +// ix += incX +// iy += incY +// idst += incDst +// } +func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []float64) []float64 + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []float64) []float64 + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []float64) + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, x, y []float64) []float64 + +// DotUnitary is +// +// for i, v := range x { +// sum += y[i] * v +// } +// return sum 
+func DotUnitary(x, y []float64) (sum float64) + +// DotInc is +// +// for i := 0; i < int(n); i++ { +// sum += y[iy] * x[ix] +// ix += incX +// iy += incY +// } +// return sum +func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) + +// L1Dist is +// +// var norm float64 +// for i, v := range s { +// norm += math.Abs(t[i] - v) +// } +// return norm +func L1Dist(s, t []float64) float64 + +// LinfDist is +// +// var norm float64 +// if len(s) == 0 { +// return 0 +// } +// norm = math.Abs(t[0] - s[0]) +// for i, v := range s[1:] { +// absDiff := math.Abs(t[i+1] - v) +// if absDiff > norm || math.IsNaN(norm) { +// norm = absDiff +// } +// } +// return norm +func LinfDist(s, t []float64) float64 + +// ScalUnitary is +// +// for i := range x { +// x[i] *= alpha +// } +func ScalUnitary(alpha float64, x []float64) + +// ScalUnitaryTo is +// +// for i, v := range x { +// dst[i] = alpha * v +// } +func ScalUnitaryTo(dst []float64, alpha float64, x []float64) + +// ScalInc is +// +// var ix uintptr +// for i := 0; i < int(n); i++ { +// x[ix] *= alpha +// ix += incX +// } +func ScalInc(alpha float64, x []float64, n, incX uintptr) + +// ScalIncTo is +// +// var idst, ix uintptr +// for i := 0; i < int(n); i++ { +// dst[idst] = alpha * x[ix] +// ix += incX +// idst += incDst +// } +func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) + +// Sum is +// +// var sum float64 +// for i := range x { +// sum += x[i] +// } +func Sum(x []float64) float64 + +// L2NormUnitary returns the L2-norm of x. +// +// var scale float64 +// sumSquares := 1.0 +// for _, v := range x { +// if v == 0 { +// continue +// } +// absxi := math.Abs(v) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// } +// return scale * math.Sqrt(sumSquares) +func L2NormUnitary(x []float64) (norm float64) + +// L2NormInc returns the L2-norm of x. +// +// var scale float64 +// sumSquares := 1.0 +// for ix := uintptr(0); ix < n*incX; ix += incX { +// val := x[ix] +// if val == 0 { +// continue +// } +// absxi := math.Abs(val) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// return scale * math.Sqrt(sumSquares) +func L2NormInc(x []float64, n, incX uintptr) (norm float64) + +// L2DistanceUnitary returns the L2-norm of x-y. 
+// +// var scale float64 +// sumSquares := 1.0 +// for i, v := range x { +// v -= y[i] +// if v == 0 { +// continue +// } +// absxi := math.Abs(v) +// if math.IsNaN(absxi) { +// return math.NaN() +// } +// if scale < absxi { +// s := scale / absxi +// sumSquares = 1 + sumSquares*s*s +// scale = absxi +// } else { +// s := absxi / scale +// sumSquares += s * s +// } +// } +// if math.IsInf(scale, 1) { +// return math.Inf(1) +// } +// return scale * math.Sqrt(sumSquares) +func L2DistanceUnitary(x, y []float64) (norm float64) diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go new file mode 100644 index 00000000..f0663791 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/stubs_noasm.go @@ -0,0 +1,182 @@ +// Copyright ©2016 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +//go:build !amd64 || noasm || gccgo || safe +// +build !amd64 noasm gccgo safe + +package f64 + +import "math" + +// L1Norm is +// +// for _, v := range x { +// sum += math.Abs(v) +// } +// return sum +func L1Norm(x []float64) (sum float64) { + for _, v := range x { + sum += math.Abs(v) + } + return sum +} + +// L1NormInc is +// +// for i := 0; i < n*incX; i += incX { +// sum += math.Abs(x[i]) +// } +// return sum +func L1NormInc(x []float64, n, incX int) (sum float64) { + for i := 0; i < n*incX; i += incX { + sum += math.Abs(x[i]) + } + return sum +} + +// Add is +// +// for i, v := range s { +// dst[i] += v +// } +func Add(dst, s []float64) { + for i, v := range s { + dst[i] += v + } +} + +// AddConst is +// +// for i := range x { +// x[i] += alpha +// } +func AddConst(alpha float64, x []float64) { + for i := range x { + x[i] += alpha + } +} + +// CumSum is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] + v +// } +// return dst +func CumSum(dst, s []float64) []float64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] + v + } + return dst +} + +// CumProd is +// +// if len(s) == 0 { +// return dst +// } +// dst[0] = s[0] +// for i, v := range s[1:] { +// dst[i+1] = dst[i] * v +// } +// return dst +func CumProd(dst, s []float64) []float64 { + if len(s) == 0 { + return dst + } + dst[0] = s[0] + for i, v := range s[1:] { + dst[i+1] = dst[i] * v + } + return dst +} + +// Div is +// +// for i, v := range s { +// dst[i] /= v +// } +func Div(dst, s []float64) { + for i, v := range s { + dst[i] /= v + } +} + +// DivTo is +// +// for i, v := range s { +// dst[i] = v / t[i] +// } +// return dst +func DivTo(dst, s, t []float64) []float64 { + for i, v := range s { + dst[i] = v / t[i] + } + return dst +} + +// L1Dist is +// +// var norm float64 +// for i, v := range s { +// norm += math.Abs(t[i] - v) +// } +// return norm +func L1Dist(s, t []float64) float64 { + var norm float64 + for i, v := range s { + norm += math.Abs(t[i] - v) + } + return norm +} + +// LinfDist is +// +// var norm float64 +// if len(s) == 0 { +// return 0 +// } +// norm = math.Abs(t[0] - s[0]) +// for i, v := range s[1:] { +// absDiff := math.Abs(t[i+1] - v) +// if absDiff > norm || math.IsNaN(norm) { +// norm = absDiff +// } +// } +// return norm +func LinfDist(s, t []float64) float64 { + var norm float64 + if len(s) == 0 { + return 0 + } + norm = math.Abs(t[0] - s[0]) + for i, v := range s[1:] { + absDiff := math.Abs(t[i+1] - v) + if absDiff > norm || 
math.IsNaN(norm) { + norm = absDiff + } + } + return norm +} + +// Sum is +// +// var sum float64 +// for i := range x { +// sum += x[i] +// } +func Sum(x []float64) float64 { + var sum float64 + for _, v := range x { + sum += v + } + return sum +} diff --git a/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s b/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s new file mode 100644 index 00000000..dd77cbd0 --- /dev/null +++ b/vendor/gonum.org/v1/gonum/internal/asm/f64/sum_amd64.s @@ -0,0 +1,99 @@ +// Copyright ©2018 The Gonum Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !noasm,!gccgo,!safe + +#include "textflag.h" + +#define X_PTR SI +#define IDX AX +#define LEN CX +#define TAIL BX +#define SUM X0 +#define SUM_1 X1 +#define SUM_2 X2 +#define SUM_3 X3 + +// func Sum(x []float64) float64 +TEXT ·Sum(SB), NOSPLIT, $0 + MOVQ x_base+0(FP), X_PTR // X_PTR = &x + MOVQ x_len+8(FP), LEN // LEN = len(x) + XORQ IDX, IDX // i = 0 + PXOR SUM, SUM // p_sum_i = 0 + CMPQ LEN, $0 // if LEN == 0 { return 0 } + JE sum_end + + PXOR SUM_1, SUM_1 + PXOR SUM_2, SUM_2 + PXOR SUM_3, SUM_3 + + MOVQ X_PTR, TAIL // Check memory alignment + ANDQ $15, TAIL // TAIL = &y % 16 + JZ no_trim // if TAIL == 0 { goto no_trim } + + // Align on 16-byte boundary + ADDSD (X_PTR), X0 // X0 += x[0] + INCQ IDX // i++ + DECQ LEN // LEN-- + JZ sum_end // if LEN == 0 { return } + +no_trim: + MOVQ LEN, TAIL + SHRQ $4, LEN // LEN = floor( n / 16 ) + JZ sum_tail8 // if LEN == 0 { goto sum_tail8 } + +sum_loop: // sum 16x wide do { + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDPD 32(X_PTR)(IDX*8), SUM_2 + ADDPD 48(X_PTR)(IDX*8), SUM_3 + ADDPD 64(X_PTR)(IDX*8), SUM + ADDPD 80(X_PTR)(IDX*8), SUM_1 + ADDPD 96(X_PTR)(IDX*8), SUM_2 + ADDPD 112(X_PTR)(IDX*8), SUM_3 + ADDQ $16, IDX // i += 16 + DECQ LEN + JNZ sum_loop // } while --LEN > 0 + +sum_tail8: + TESTQ $8, TAIL + JZ sum_tail4 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDPD 32(X_PTR)(IDX*8), SUM_2 + ADDPD 48(X_PTR)(IDX*8), SUM_3 + ADDQ $8, IDX + +sum_tail4: + ADDPD SUM_3, SUM + ADDPD SUM_2, SUM_1 + + TESTQ $4, TAIL + JZ sum_tail2 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDPD 16(X_PTR)(IDX*8), SUM_1 + ADDQ $4, IDX + +sum_tail2: + ADDPD SUM_1, SUM + + TESTQ $2, TAIL + JZ sum_tail1 + + ADDPD (X_PTR)(IDX*8), SUM // sum_i += x[i:i+2] + ADDQ $2, IDX + +sum_tail1: + HADDPD SUM, SUM // sum_i[0] += sum_i[1] + + TESTQ $1, TAIL + JZ sum_end + + ADDSD (X_PTR)(IDX*8), SUM + +sum_end: // return sum + MOVSD SUM, ret+24(FP) + RET diff --git a/vendor/modules.txt b/vendor/modules.txt index 6f6aadb4..77ca4a24 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -985,6 +985,11 @@ golang.org/x/tools/internal/typeparams # gomodules.xyz/jsonpatch/v2 v2.2.0 ## explicit; go 1.12 gomodules.xyz/jsonpatch/v2 +# gonum.org/v1/gonum v0.14.0 +## explicit; go 1.20 +gonum.org/v1/gonum/floats +gonum.org/v1/gonum/floats/scalar +gonum.org/v1/gonum/internal/asm/f64 # google.golang.org/appengine v1.6.7 ## explicit; go 1.11 google.golang.org/appengine
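
For reference, a minimal sketch of exercising the newly vendored
gonum.org/v1/gonum/floats package (illustrative only; the sample data and
program below are not part of this change). On amd64 these calls dispatch to
the internal/asm/f64 kernels added above, and to the pure-Go fallbacks under
the noasm/gccgo/safe build tags:

	package main

	import (
		"fmt"

		"gonum.org/v1/gonum/floats"
	)

	func main() {
		// Hypothetical per-node samples; stands in for whatever a caller collects.
		loads := []float64{97.3, 101.8, 99.5, 102.2}

		// floats.Sum uses the unrolled Sum kernel (sum_amd64.s) on amd64.
		total := floats.Sum(loads)

		// floats.Norm(x, 2) routes through the scaled L2 norm kernel, which
		// guards against intermediate overflow and NaN/Inf inputs.
		fmt.Printf("sum=%.1f mean=%.2f l2norm=%.2f\n",
			total, total/float64(len(loads)), floats.Norm(loads, 2))
	}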