From dc0a0ac27c22c2ea3b705a513a626dac8bc227a2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 10 Dec 2020 01:21:37 -0800 Subject: [PATCH 001/158] Added tests for more versions of Active Record --- .github/workflows/build.yml | 14 +++++++++++++- gemfiles/activerecord52.gemfile | 11 +++++++++++ gemfiles/activerecord60.gemfile | 11 +++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 gemfiles/activerecord52.gemfile create mode 100644 gemfiles/activerecord60.gemfile diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index aa71e74..179a6ee 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -3,7 +3,19 @@ on: [push, pull_request] jobs: build: if: "!contains(github.event.head_commit.message, '[skip ci]')" - runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - ruby: 2.7 + gemfile: Gemfile + - ruby: 2.6 + gemfile: gemfiles/activerecord60.gemfile + - ruby: 2.5 + gemfile: gemfiles/activerecord52.gemfile + runs-on: ubuntu-20.04 + env: + BUNDLE_GEMFILE: ${{ matrix.gemfile }} steps: - uses: actions/checkout@v2 - uses: ruby/setup-ruby@v1 diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile new file mode 100644 index 0000000..939f2e0 --- /dev/null +++ b/gemfiles/activerecord52.gemfile @@ -0,0 +1,11 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 5.2.0" +gem "sqlite3" +gem "daru" +gem "rover-df" +gem "ngt", ">= 0.3.0" diff --git a/gemfiles/activerecord60.gemfile b/gemfiles/activerecord60.gemfile new file mode 100644 index 0000000..8a4ea25 --- /dev/null +++ b/gemfiles/activerecord60.gemfile @@ -0,0 +1,11 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 6.0.0" +gem "sqlite3" +gem "daru" +gem "rover-df" +gem "ngt", ">= 0.3.0" From 3eef8e355663df426360f27e54b4bd49237ecaeb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 26 Dec 2020 18:59:18 -0800 Subject: [PATCH 002/158] Test with Ruby 3 --- .github/workflows/build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 179a6ee..1979a37 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,11 +7,11 @@ jobs: fail-fast: false matrix: include: - - ruby: 2.7 + - ruby: 3.0 gemfile: Gemfile - - ruby: 2.6 + - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile - - ruby: 2.5 + - ruby: 2.6 gemfile: gemfiles/activerecord52.gemfile runs-on: ubuntu-20.04 env: From b2341dff988103306e2226b2fd29c64f279ccea8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 22 Jan 2021 03:17:51 -0800 Subject: [PATCH 003/158] Updated readme [skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e9309ed..16819fb 100644 --- a/README.md +++ b/README.md @@ -46,13 +46,13 @@ recommender.fit([ > Use `value` instead of rating for implicit feedback -Get user-based (user-item) recommendations - “users like you also liked” +Get user-based recommendations - “users like you also liked” ```ruby recommender.user_recs(user_id) ``` -Get item-based (item-item) recommendations - “users who liked this item also liked” +Get item-based recommendations - “users who liked this item also liked” ```ruby recommender.item_recs(item_id) From b6f5416e616bf7d954362ce6fc246dba6f184814 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 26 Jan 2021 18:17:01 -0800 Subject: [PATCH 004/158] Removed unnecessary sorting --- lib/disco/recommender.rb | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index cdea4e0..2da4498 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -6,6 +6,8 @@ def initialize(factors: 8, epochs: 20, verbose: nil) @factors = factors @epochs = epochs @verbose = verbose + @user_map = {} + @item_map = {} end def fit(train_set, validation_set: nil) @@ -26,7 +28,7 @@ def fit(train_set, validation_set: nil) end check_training_set(train_set) - create_maps(train_set) + update_maps(train_set) @rated = Hash.new { |hash, key| hash[key] = {} } input = [] @@ -211,15 +213,14 @@ def similar(id, map, factors, norms, count, index) end end - def create_maps(train_set) - user_ids = train_set.map { |v| v[:user_id] }.uniq.sort - item_ids = train_set.map { |v| v[:item_id] }.uniq.sort + def update_maps(train_set) + raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? } + raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? } - raise ArgumentError, "Missing user_id" if user_ids.any?(&:nil?) - raise ArgumentError, "Missing item_id" if item_ids.any?(&:nil?) - - @user_map = user_ids.zip(user_ids.size.times).to_h - @item_map = item_ids.zip(item_ids.size.times).to_h + train_set.each do |v| + @user_map[v[:user_id]] ||= @user_map.size + @item_map[v[:item_id]] ||= @item_map.size + end end def check_ratings(ratings) From 80c4882ee29071c3c4f1fab23c0f84f93826cf31 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 26 Jan 2021 19:53:23 -0800 Subject: [PATCH 005/158] Removed norms calculation for optimized similarity --- lib/disco/recommender.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 2da4498..47e32f0 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -145,13 +145,13 @@ def optimize_similar_users def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, @item_factors, item_norms, count, @item_index) + similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, @user_factors, user_norms, count, @user_index) + similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index) end private From 7360d759209078288be811491d9f220b2cf67f3b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 26 Jan 2021 22:08:46 -0800 Subject: [PATCH 006/158] Added safety check for optimized similarity --- lib/disco/recommender.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 47e32f0..96b7a60 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -160,7 +160,8 @@ def create_index(factors) require "ngt" index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - index.batch_insert(factors) + ids = index.batch_insert(factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] index end From 7b611bca5fca37d9a3a15e5c64a21c30aa633491 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 27 Jan 2021 18:37:42 -0800 Subject: [PATCH 007/158] Use minmax --- lib/disco/recommender.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 96b7a60..11963b5 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -19,8 +19,7 @@ def fit(train_set, validation_set: nil) unless @implicit ratings = train_set.map { |o| o[:rating] } check_ratings(ratings) - @min_rating = ratings.min - @max_rating = ratings.max + @min_rating, @max_rating = ratings.minmax if validation_set check_ratings(validation_set.map { |o| o[:rating] }) From 9189dcbe148a41d3293cecad0c00c83cc2b0c03b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 27 Jan 2021 18:41:06 -0800 Subject: [PATCH 008/158] Reduced allocations --- lib/disco/recommender.rb | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 11963b5..78c533d 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -14,19 +14,18 @@ def fit(train_set, validation_set: nil) train_set = to_dataset(train_set) validation_set = to_dataset(validation_set) if validation_set - @implicit = !train_set.any? { |v| v[:rating] } + check_training_set(train_set) + @implicit = !train_set.any? { |v| v[:rating] } unless @implicit - ratings = train_set.map { |o| o[:rating] } - check_ratings(ratings) - @min_rating, @max_rating = ratings.minmax + check_ratings(train_set) + @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } if validation_set - check_ratings(validation_set.map { |o| o[:rating] }) + check_ratings(validation_set) end end - check_training_set(train_set) update_maps(train_set) @rated = Hash.new { |hash, key| hash[key] = {} } @@ -224,10 +223,10 @@ def update_maps(train_set) end def check_ratings(ratings) - unless ratings.all? { |r| !r.nil? } + unless ratings.all? { |r| !r[:rating].nil? } raise ArgumentError, "Missing ratings" end - unless ratings.all? { |r| r.is_a?(Numeric) } + unless ratings.all? { |r| r[:rating].is_a?(Numeric) } raise ArgumentError, "Ratings must be numeric" end end From d24ea8e9863198390390074413b187a08dd17074 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Jan 2021 11:09:02 -0800 Subject: [PATCH 009/158] Use inner for consistency --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 78c533d..0231d6f 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -192,7 +192,7 @@ def similar(id, map, factors, norms, count, index) } end else - predictions = factors.dot(factors[i, true]) / norms + predictions = factors.inner(factors[i, true]) / norms predictions = map.keys.zip(predictions).map do |item_id, pred| From 2dbd0a84117899356f7beae88f5307e2ab78fd7f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Jan 2021 11:57:36 -0800 Subject: [PATCH 010/158] Added comment [skip ci] --- lib/disco/recommender.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 0231d6f..482cecb 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -192,6 +192,7 @@ def similar(id, map, factors, norms, count, index) } end else + # cosine similarity predictions = factors.inner(factors[i, true]) / norms predictions = From eb46ba0402879778578310c41dda1a172badd95c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Jan 2021 12:49:38 -0800 Subject: [PATCH 011/158] Fixed similarity calculation --- CHANGELOG.md | 4 ++++ lib/disco/recommender.rb | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 614fe42..bd4588d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.4 (unreleased) + +- Fixed similarity calculation + ## 0.2.3 (2020-11-28) - Added `predict` method diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 482cecb..1576dee 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -193,7 +193,7 @@ def similar(id, map, factors, norms, count, index) end else # cosine similarity - predictions = factors.inner(factors[i, true]) / norms + predictions = factors.inner(factors[i, true]) / (norms[i] * norms) predictions = map.keys.zip(predictions).map do |item_id, pred| From 0ec80cd88fa7f7572db54b321d0d5da153029f84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Jan 2021 13:34:20 -0800 Subject: [PATCH 012/158] Reverted calculation, but use norms[i] instead of max score --- CHANGELOG.md | 4 ---- lib/disco/recommender.rb | 8 ++++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd4588d..614fe42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,3 @@ -## 0.2.4 (unreleased) - -- Fixed similarity calculation - ## 0.2.3 (2020-11-28) - Added `predict` method diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 1576dee..b1103c0 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -192,8 +192,8 @@ def similar(id, map, factors, norms, count, index) } end else - # cosine similarity - predictions = factors.inner(factors[i, true]) / (norms[i] * norms) + # cosine similarity without norms[i] + predictions = factors.inner(factors[i, true]) / norms predictions = map.keys.zip(predictions).map do |item_id, pred| @@ -203,9 +203,9 @@ def similar(id, map, factors, norms, count, index) max_score = predictions.delete_at(i)[:score] predictions.sort_by! { |pred| -pred[:score] } # already sorted by id predictions = predictions.first(count) if count - # divide by max score to get cosine similarity + # divide by norms[i] to get cosine similarity # only need to do for returned records - predictions.each { |pred| pred[:score] /= max_score } + predictions.each { |pred| pred[:score] /= norms[i] } predictions end else From 6b4d91a2cf82db90ed47cae6b1a7d2a73a02abac Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Jan 2021 13:39:20 -0800 Subject: [PATCH 013/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index b1103c0..8cf1d8a 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -193,6 +193,7 @@ def similar(id, map, factors, norms, count, index) end else # cosine similarity without norms[i] + # otherwise, denominator would be (norms[i] * norms) predictions = factors.inner(factors[i, true]) / norms predictions = From dce5855e48b48b18dde16365c3385151111d1de3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 30 Jan 2021 20:30:15 -0800 Subject: [PATCH 014/158] Improved optimize_similar_items test [skip ci] --- test/recommender_test.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index b4890f6..a327b10 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -255,9 +255,12 @@ def test_optimize_similar_items recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) + original_recs = recommender.item_recs("Star Wars (1977)") + recommender.optimize_similar_items recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } assert_equal 5, recs.size item_ids = recs.map { |r| r[:item_id] } From ca8257e5756bdc85e270e0ffdeb1afd948a707cc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 30 Jan 2021 20:32:54 -0800 Subject: [PATCH 015/158] Test score --- test/recommender_test.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index a327b10..d26c043 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -261,6 +261,9 @@ def test_optimize_similar_items recs = recommender.item_recs("Star Wars (1977)") assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end assert_equal 5, recs.size item_ids = recs.map { |r| r[:item_id] } From 797edc97deb833883b20158fcbb8dd56983b92a6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 30 Jan 2021 20:36:45 -0800 Subject: [PATCH 016/158] Added comment [skip ci] --- lib/disco/recommender.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 8cf1d8a..a6cda62 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -157,6 +157,8 @@ def similar_users(user_id, count: 5) def create_index(factors) require "ngt" + # could speed up search with normalized cosine + # https://github.com/yahoojapan/NGT/issues/36 index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") ids = index.batch_insert(factors) raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] From 109ee0e47fe467c92cbfabd0b13c085287a90135 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 15 Feb 2021 18:59:06 -0800 Subject: [PATCH 017/158] Fixed warning --- Rakefile | 2 +- lib/disco/recommender.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Rakefile b/Rakefile index 35dafa0..981f476 100644 --- a/Rakefile +++ b/Rakefile @@ -5,5 +5,5 @@ task default: :test Rake::TestTask.new do |t| t.libs << "test" t.pattern = "test/**/*_test.rb" - t.warning = false + t.warning = false # for daru end diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index a6cda62..392a734 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -203,7 +203,7 @@ def similar(id, map, factors, norms, count, index) {item_id: item_id, score: pred} end - max_score = predictions.delete_at(i)[:score] + predictions.delete_at(i) predictions.sort_by! { |pred| -pred[:score] } # already sorted by id predictions = predictions.first(count) if count # divide by norms[i] to get cosine similarity From 73ee6ef9e22d62b1a508cb7f92d24f4bac6f6c1d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 15 Feb 2021 20:01:31 -0800 Subject: [PATCH 018/158] Added methods to get ids and to get factors for specific users and items --- CHANGELOG.md | 6 ++++++ README.md | 16 +++++++++++++++- lib/disco/recommender.rb | 28 +++++++++++++++++++++++++++- test/recommender_test.rb | 31 +++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 614fe42..474cc9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.2.4 (unreleased) + +- Added `user_ids` and `item_ids` methods +- Added `user_id` argument to `user_factors` +- Added `item_id` argument to `item_factors` + ## 0.2.3 (2020-11-28) - Added `predict` method diff --git a/README.md b/README.md index 16819fb..bc3b403 100644 --- a/README.md +++ b/README.md @@ -283,19 +283,33 @@ This should be called after fitting or loading the model. ## Reference +Get ids + +```ruby +recommender.user_ids +recommender.item_ids +``` + Get the global mean ```ruby recommender.global_mean ``` -Get the factors +Get factors ```ruby recommender.user_factors recommender.item_factors ``` +Get factors for specific users and items + +```ruby +recommender.user_factors(user_id) +recommender.item_factors(item_id) +``` + ## Credits Thanks to: diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 392a734..6663170 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -1,6 +1,6 @@ module Disco class Recommender - attr_reader :global_mean, :item_factors, :user_factors + attr_reader :global_mean def initialize(factors: 8, epochs: 20, verbose: nil) @factors = factors @@ -152,6 +152,32 @@ def similar_users(user_id, count: 5) similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index) end + def user_ids + @user_map.keys + end + + def item_ids + @item_map.keys + end + + def user_factors(user_id = nil) + if user_id + u = @user_map[user_id] + @user_factors[u, true] if u + else + @user_factors + end + end + + def item_factors(item_id = nil) + if item_id + i = @item_map[item_id] + @item_factors[i, true] if i + else + @item_factors + end + end + private def create_index(factors) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index d26c043..1c3b00f 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -89,6 +89,37 @@ def test_rated assert_equal ["A", "B"], recommender.user_recs(2).map { |r| r[:item_id] }.sort end + def test_ids + data = [ + {user_id: 1, item_id: "A"}, + {user_id: 1, item_id: "B"}, + {user_id: 2, item_id: "B"} + ] + recommender = Disco::Recommender.new + recommender.fit(data) + assert_equal [1, 2], recommender.user_ids + assert_equal ["A", "B"], recommender.item_ids + end + + def test_factors + data = [ + {user_id: 1, item_id: "A"}, + {user_id: 1, item_id: "B"}, + {user_id: 2, item_id: "B"} + ] + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + assert_equal [2, 20], recommender.user_factors.shape + assert_equal [2, 20], recommender.item_factors.shape + + assert_equal [20], recommender.user_factors(1).shape + assert_equal [20], recommender.item_factors("A").shape + + assert_nil recommender.user_factors(3) + assert_nil recommender.item_factors("C") + end + def test_validation_set_explicit data = Disco.load_movielens train_set = data.first(80000) From 4126bca18279f259053a8f9e73f9f5b5f949400c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 15 Feb 2021 20:17:07 -0800 Subject: [PATCH 019/158] Version bump to 0.2.4 [skip ci] --- CHANGELOG.md | 2 +- LICENSE.txt | 2 +- disco.gemspec | 2 +- lib/disco/version.rb | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 474cc9c..91649f4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.4 (unreleased) +## 0.2.4 (2021-02-15) - Added `user_ids` and `item_ids` methods - Added `user_id` argument to `user_factors` diff --git a/LICENSE.txt b/LICENSE.txt index e2e1d2a..bc58858 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2019-2020 Andrew Kane +Copyright (c) 2019-2021 Andrew Kane MIT License diff --git a/disco.gemspec b/disco.gemspec index d851258..cdebe15 100644 --- a/disco.gemspec +++ b/disco.gemspec @@ -8,7 +8,7 @@ Gem::Specification.new do |spec| spec.license = "MIT" spec.author = "Andrew Kane" - spec.email = "andrew@chartkick.com" + spec.email = "andrew@ankane.org" spec.files = Dir["*.{md,txt}", "{app,lib}/**/*"] spec.require_path = "lib" diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 3c82993..a330ec9 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.3" + VERSION = "0.2.4" end From b16cbc20bfa795df01eeb8f2bef22d413e9a7d21 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 15 Feb 2021 21:00:41 -0800 Subject: [PATCH 020/158] Added note about storing factors [skip ci] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index bc3b403..6b0b3c4 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,8 @@ bin = File.binread("recommender.bin") recommender = Marshal.load(bin) ``` +Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor) + ## Algorithms Disco uses high-performance matrix factorization. From 98b92eed4768bf6afca28eee223a93e5ad134f39 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 18:53:45 -0800 Subject: [PATCH 021/158] Added optimize_similar_users method --- CHANGELOG.md | 4 +++ Gemfile | 1 + README.md | 20 +++++++++++- lib/disco/recommender.rb | 67 +++++++++++++++++++++++++++------------- test/recommender_test.rb | 19 ++++++++++++ 5 files changed, 89 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 91649f4..2a978a1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.5 (unreleased) + +- Added `optimize_similar_users` method + ## 0.2.4 (2021-02-15) - Added `user_ids` and `item_ids` methods diff --git a/Gemfile b/Gemfile index 9d4d5b4..4cb7fc2 100644 --- a/Gemfile +++ b/Gemfile @@ -9,3 +9,4 @@ gem "sqlite3" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/README.md b/README.md index 6b0b3c4..bbbeddd 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,25 @@ Or a Daru data frame Daru::DataFrame.from_csv("ratings.csv") ``` -## Faster Similarity +## Faster User-Based Recommendations [master, experimental] + +If you have a large number of users/items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to speed up user-based recommendations. + +Add this line to your application’s Gemfile: + +```ruby +gem 'faiss' +``` + +Speed up user-based recommendations with: + +```ruby +model.optimize_user_recs +``` + +This should be called after fitting or loading the model. + +## Faster Item-Based Recommendations and Similar Users If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users. diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 6663170..103059d 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -95,34 +95,43 @@ def user_recs(user_id, count: 5, item_ids: nil) u = @user_map[user_id] if u - predictions = @item_factors.inner(@user_factors[u, true]) + if @user_recs_index && count + distances, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + @rated[u].size).map { |v| v.to_a[0] } + distances.map! { |v| v < @min_rating ? @min_rating : (v > @max_rating ? @max_rating : v) } if @min_rating + keys = @item_map.keys + ids.zip(distances).reject { |item_id, _| @rated[u][item_id] }.map do |item_id, distance| + {item_id: keys[item_id], score: distance} + end.first(count) + else + predictions = @item_factors.inner(@user_factors[u, true]) - predictions = - @item_map.keys.zip(predictions).map do |item_id, pred| - {item_id: item_id, score: pred} - end + predictions = + @item_map.keys.zip(predictions).map do |item_id, pred| + {item_id: item_id, score: pred} + end - if item_ids - idx = item_ids.map { |i| @item_map[i] }.compact - predictions = predictions.values_at(*idx) - else - @rated[u].keys.sort_by { |v| -v }.each do |i| - predictions.delete_at(i) + if item_ids + idx = item_ids.map { |i| @item_map[i] }.compact + predictions = predictions.values_at(*idx) + else + @rated[u].keys.sort_by { |v| -v }.each do |i| + predictions.delete_at(i) + end end - end - predictions.sort_by! { |pred| -pred[:score] } # already sorted by id - predictions = predictions.first(count) if count && !item_ids + predictions.sort_by! { |pred| -pred[:score] } # already sorted by id + predictions = predictions.first(count) if count && !item_ids - # clamp *after* sorting - # also, only needed for returned predictions - if @min_rating - predictions.each do |pred| - pred[:score] = pred[:score].clamp(@min_rating, @max_rating) + # clamp *after* sorting + # also, only needed for returned predictions + if @min_rating + predictions.each do |pred| + pred[:score] = pred[:score].clamp(@min_rating, @max_rating) + end end - end - predictions + predictions + end else # no items if user is unknown # TODO maybe most popular items @@ -130,6 +139,22 @@ def user_recs(user_id, count: 5, item_ids: nil) end end + def optimize_user_recs + check_fit + + require "faiss" + + # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes + # TODO use non-exact index + @user_recs_index = Faiss::IndexFlatIP.new(item_factors.shape[1]) + + # ids are from 0...total + # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89 + @user_recs_index.add(item_factors) + + nil + end + def optimize_similar_items check_fit @item_index = create_index(@item_factors) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 1c3b00f..592e235 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -279,6 +279,25 @@ def test_daru assert_equal ["user_id", "item_id", "rating"], data.vectors.to_a end + def test_optimize_similar_users + skip "Faiss not available on Windows" if Gem.win_platform? + + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.user_recs(1) + + recommender.optimize_user_recs + + recs = recommender.user_recs(1) + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + def test_optimize_similar_items skip "NGT not available on Windows" if Gem.win_platform? From 54f209d09a742e0df80961c67f3beb433d1601f1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 18:58:33 -0800 Subject: [PATCH 022/158] Updated headers [skip ci] --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bbbeddd..d2ebbfd 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,9 @@ Or a Daru data frame Daru::DataFrame.from_csv("ratings.csv") ``` -## Faster User-Based Recommendations [master, experimental] +## Performance + +### User-Based Recommendations [master, experimental] If you have a large number of users/items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to speed up user-based recommendations. @@ -277,7 +279,7 @@ model.optimize_user_recs This should be called after fitting or loading the model. -## Faster Item-Based Recommendations and Similar Users +### Item-Based Recommendations and Similar Users If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users. From b90e81f382883e59003cbdae9b365704dad94cdb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 19:05:02 -0800 Subject: [PATCH 023/158] Updated readme [skip ci] --- README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index d2ebbfd..5cadfd6 100644 --- a/README.md +++ b/README.md @@ -261,17 +261,17 @@ Daru::DataFrame.from_csv("ratings.csv") ## Performance -### User-Based Recommendations [master, experimental] +If you have a large number of users or items, you can use an approximate nearest neighbors library to improve the performance of certain methods. -If you have a large number of users/items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to speed up user-based recommendations. +### User-Based Recommendations [master, experimental] -Add this line to your application’s Gemfile: +Add [Faiss](https://github.com/ankane/faiss) to your application’s Gemfile: ```ruby gem 'faiss' ``` -Speed up user-based recommendations with: +Speed up the `user_recs` method with: ```ruby model.optimize_user_recs @@ -281,21 +281,19 @@ This should be called after fitting or loading the model. ### Item-Based Recommendations and Similar Users -If you have a large number of users/items, you can use an approximate nearest neighbors library like [NGT](https://github.com/ankane/ngt) to speed up item-based recommendations and similar users. - -Add this line to your application’s Gemfile: +Add [NGT](https://github.com/ankane/ngt) to your application’s Gemfile: ```ruby gem 'ngt', '>= 0.3.0' ``` -Speed up item-based recommendations with: +Speed up the `item_recs` method with: ```ruby model.optimize_item_recs ``` -Speed up similar users with: +Speed up the `similar_users` method with: ```ruby model.optimize_similar_users From 83061758bd4e8d1694743230851ecca33d449e95 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 19:32:36 -0800 Subject: [PATCH 024/158] Fixed CI --- gemfiles/activerecord52.gemfile | 1 + gemfiles/activerecord60.gemfile | 1 + 2 files changed, 2 insertions(+) diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile index 939f2e0..1253ee7 100644 --- a/gemfiles/activerecord52.gemfile +++ b/gemfiles/activerecord52.gemfile @@ -9,3 +9,4 @@ gem "sqlite3" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" +gem "faiss" diff --git a/gemfiles/activerecord60.gemfile b/gemfiles/activerecord60.gemfile index 8a4ea25..2e884d6 100644 --- a/gemfiles/activerecord60.gemfile +++ b/gemfiles/activerecord60.gemfile @@ -9,3 +9,4 @@ gem "sqlite3" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" +gem "faiss" From b123907692f6025aec68c984b0daf5344e656e95 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 19:55:29 -0800 Subject: [PATCH 025/158] Improved variable naming [skip ci] --- lib/disco/recommender.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 103059d..a482c6a 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -157,24 +157,24 @@ def optimize_user_recs def optimize_similar_items check_fit - @item_index = create_index(@item_factors) + @similar_items_index = create_index(@item_factors) end alias_method :optimize_item_recs, :optimize_similar_items def optimize_similar_users check_fit - @user_index = create_index(@user_factors) + @similar_users_index = create_index(@user_factors) end def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, @item_factors, @item_index ? nil : item_norms, count, @item_index) + similar(item_id, @item_map, @item_factors, @similar_items_index ? nil : item_norms, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, @user_factors, @user_index ? nil : user_norms, count, @user_index) + similar(user_id, @user_map, @user_factors, @similar_users_index ? nil : user_norms, count, @similar_users_index) end def user_ids From f92b766c0886d9948102d1dd1089c514ba403d4c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 19:56:29 -0800 Subject: [PATCH 026/158] Fixed test name [skip ci] --- test/recommender_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 592e235..b8e9239 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -279,7 +279,7 @@ def test_daru assert_equal ["user_id", "item_id", "rating"], data.vectors.to_a end - def test_optimize_similar_users + def test_optimize_user_recs skip "Faiss not available on Windows" if Gem.win_platform? data = Disco.load_movielens From 48e5816aa0adb5d9fd6c1ad759e2be73d9e2dedc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 20:44:13 -0800 Subject: [PATCH 027/158] Added support for Faiss for optimize_item_recs and optimize_similar_users methods --- CHANGELOG.md | 1 + README.md | 18 ++--------- lib/disco/recommender.rb | 69 +++++++++++++++++++++++++++------------- test/recommender_test.rb | 53 +++++++++++++++++++++++++++--- test/test_helper.rb | 6 ++++ 5 files changed, 106 insertions(+), 41 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a978a1..9211ce0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.5 (unreleased) - Added `optimize_similar_users` method +- Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods ## 0.2.4 (2021-02-15) diff --git a/README.md b/README.md index 5cadfd6..9feec90 100644 --- a/README.md +++ b/README.md @@ -259,13 +259,11 @@ Or a Daru data frame Daru::DataFrame.from_csv("ratings.csv") ``` -## Performance +## Performance [master] -If you have a large number of users or items, you can use an approximate nearest neighbors library to improve the performance of certain methods. +If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods. -### User-Based Recommendations [master, experimental] - -Add [Faiss](https://github.com/ankane/faiss) to your application’s Gemfile: +Add this line to your application’s Gemfile: ```ruby gem 'faiss' @@ -277,16 +275,6 @@ Speed up the `user_recs` method with: model.optimize_user_recs ``` -This should be called after fitting or loading the model. - -### Item-Based Recommendations and Similar Users - -Add [NGT](https://github.com/ankane/ngt) to your application’s Gemfile: - -```ruby -gem 'ngt', '>= 0.3.0' -``` - Speed up the `item_recs` method with: ```ruby diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index a482c6a..6266be6 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -155,26 +155,26 @@ def optimize_user_recs nil end - def optimize_similar_items + def optimize_similar_items(library: nil) check_fit - @similar_items_index = create_index(@item_factors) + @similar_items_index = create_index(:item, library: library) end alias_method :optimize_item_recs, :optimize_similar_items - def optimize_similar_users + def optimize_similar_users(library: nil) check_fit - @similar_users_index = create_index(@user_factors) + @similar_users_index = create_index(:user, library: library) end def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, @item_factors, @similar_items_index ? nil : item_norms, count, @similar_items_index) + similar(item_id, @item_map, @item_factors, @similar_items_index ? @item_norms : item_norms, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, @user_factors, @similar_users_index ? nil : user_norms, count, @similar_users_index) + similar(user_id, @user_map, @user_factors, @similar_users_index ? @user_norms : user_norms, count, @similar_users_index) end def user_ids @@ -205,15 +205,31 @@ def item_factors(item_id = nil) private - def create_index(factors) - require "ngt" + def create_index(key, library:) + # TODO make Faiss the default in 0.3.0 + library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt" - # could speed up search with normalized cosine - # https://github.com/yahoojapan/NGT/issues/36 - index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - ids = index.batch_insert(factors) - raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] - index + factors = send("#{key}_factors") + + case library + when "faiss" + require "faiss" + + index = Faiss::IndexFlatIP.new(factors.shape[1]) + index.add(factors / send("#{key}_norms").expand_dims(1)) + index + when "ngt" + require "ngt" + + # could speed up search with normalized cosine + # https://github.com/yahoojapan/NGT/issues/36 + index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") + ids = index.batch_insert(factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] + index + else + raise ArgumentError, "Invalid library: #{library}" + end end def user_norms @@ -230,19 +246,28 @@ def norms(factors) norms end + # TODO change key to user_id for similar_users in 0.3.0 def similar(id, map, factors, norms, count, index) i = map[id] if i if index && count keys = map.keys - result = index.search(factors[i, true], size: count + 1)[1..-1] - result.map do |v| - { - # ids from batch_insert start at 1 instead of 0 - item_id: keys[v[:id] - 1], - # convert cosine distance to cosine similarity - score: 1 - v[:distance] - } + + if defined?(Faiss) && index.is_a?(Faiss::Index) + distances, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } + ids.zip(distances).map do |id, distance| + {item_id: keys[id], score: distance} + end[1..-1] + else + result = index.search(factors[i, true], size: count + 1)[1..-1] + result.map do |v| + { + # ids from batch_insert start at 1 instead of 0 + item_id: keys[v[:id] - 1], + # convert cosine distance to cosine similarity + score: 1 - v[:distance] + } + end end else # cosine similarity without norms[i] diff --git a/test/recommender_test.rb b/test/recommender_test.rb index b8e9239..cf6ba5b 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -280,7 +280,7 @@ def test_daru end def test_optimize_user_recs - skip "Faiss not available on Windows" if Gem.win_platform? + skip "Faiss not available on Windows" if windows? data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) @@ -298,8 +298,8 @@ def test_optimize_user_recs assert_equal 5, recs.size end - def test_optimize_similar_items - skip "NGT not available on Windows" if Gem.win_platform? + def test_optimize_item_recs + skip "Faiss not available on Windows" if windows? data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) @@ -307,7 +307,52 @@ def test_optimize_similar_items original_recs = recommender.item_recs("Star Wars (1977)") - recommender.optimize_similar_items + recommender.optimize_item_recs(library: "faiss") + + recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + + assert_in_delta 0.9972, recs.first[:score], 0.01 + end + + def test_optimize_similar_users + skip "Faiss not available on Windows" if windows? + + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.similar_users(1) + + recommender.optimize_similar_users(library: "faiss") + + recs = recommender.similar_users(1) + + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + + def test_optimize_item_recs_ngt + skip "NGT not available on Windows" if windows? + + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.item_recs("Star Wars (1977)") + + recommender.optimize_item_recs(library: "ngt") recs = recommender.item_recs("Star Wars (1977)") assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } diff --git a/test/test_helper.rb b/test/test_helper.rb index 419a47e..27c6f72 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -8,3 +8,9 @@ require "rover" require_relative "support/active_record" + +class Minitest::Test + def windows? + Gem.win_platform? + end +end From db9105b03bcd21fcbf0349a18cc76898d150e262 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 20:48:52 -0800 Subject: [PATCH 028/158] Added comment [skip ci] --- lib/disco/recommender.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 6266be6..b1fe673 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -215,6 +215,8 @@ def create_index(key, library:) when "faiss" require "faiss" + # inner product is cosine similarity with normalized vectors + # https://github.com/facebookresearch/faiss/issues/95 index = Faiss::IndexFlatIP.new(factors.shape[1]) index.add(factors / send("#{key}_norms").expand_dims(1)) index From dd899bd9c4162d4afe23c00fc587b9c87bd9272a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 21:23:53 -0800 Subject: [PATCH 029/158] Added rmse method [skip ci] --- CHANGELOG.md | 1 + lib/disco.rb | 1 + lib/disco/metrics.rb | 10 ++++++++++ test/metrics_test.rb | 7 +++++++ 4 files changed, 19 insertions(+) create mode 100644 lib/disco/metrics.rb create mode 100644 test/metrics_test.rb diff --git a/CHANGELOG.md b/CHANGELOG.md index 9211ce0..17d0032 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Added `optimize_similar_users` method - Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods +- Added `rmse` method ## 0.2.4 (2021-02-15) diff --git a/lib/disco.rb b/lib/disco.rb index 5e36e1c..b8781f8 100644 --- a/lib/disco.rb +++ b/lib/disco.rb @@ -9,6 +9,7 @@ # modules require "disco/data" +require "disco/metrics" require "disco/recommender" require "disco/version" diff --git a/lib/disco/metrics.rb b/lib/disco/metrics.rb new file mode 100644 index 0000000..dfcf0b5 --- /dev/null +++ b/lib/disco/metrics.rb @@ -0,0 +1,10 @@ +module Disco + module Metrics + class << self + def rmse(act, exp) + raise ArgumentError, "Size mismatch" if act.size != exp.size + Math.sqrt(act.zip(exp).sum { |a, e| (a - e)**2 } / act.size.to_f) + end + end + end +end diff --git a/test/metrics_test.rb b/test/metrics_test.rb new file mode 100644 index 0000000..ebfc602 --- /dev/null +++ b/test/metrics_test.rb @@ -0,0 +1,7 @@ +require_relative "test_helper" + +class MetricsTest < Minitest::Test + def test_rmse + assert_in_delta 2, Disco::Metrics.rmse([0, 0, 0, 1, 1], [0, 2, 4, 1, 1]) + end +end From 93222e8965bbf03b87a5ec983bb98612aa551d9f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 21:26:04 -0800 Subject: [PATCH 030/158] Moved optimize tests to new file --- test/optimize_test.rb | 92 ++++++++++++++++++++++++++++++++++++++++ test/recommender_test.rb | 89 -------------------------------------- test/test_helper.rb | 6 --- 3 files changed, 92 insertions(+), 95 deletions(-) create mode 100644 test/optimize_test.rb diff --git a/test/optimize_test.rb b/test/optimize_test.rb new file mode 100644 index 0000000..319f41e --- /dev/null +++ b/test/optimize_test.rb @@ -0,0 +1,92 @@ +require_relative "test_helper" + +class OptimizeTest < Minitest::Test + def setup + skip "Not available on Windows" if windows? + end + + def test_optimize_user_recs + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.user_recs(1) + + recommender.optimize_user_recs + + recs = recommender.user_recs(1) + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + + def test_optimize_item_recs + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.item_recs("Star Wars (1977)") + + recommender.optimize_item_recs(library: "faiss") + + recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + + assert_in_delta 0.9972, recs.first[:score], 0.01 + end + + def test_optimize_similar_users + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.similar_users(1) + + recommender.optimize_similar_users(library: "faiss") + + recs = recommender.similar_users(1) + + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + + def test_optimize_item_recs_ngt + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.item_recs("Star Wars (1977)") + + recommender.optimize_item_recs(library: "ngt") + + recs = recommender.item_recs("Star Wars (1977)") + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + + assert_in_delta 0.9972, recs.first[:score], 0.01 + end + + def windows? + Gem.win_platform? + end +end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index cf6ba5b..c23479e 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -278,93 +278,4 @@ def test_daru # original data frame not modified assert_equal ["user_id", "item_id", "rating"], data.vectors.to_a end - - def test_optimize_user_recs - skip "Faiss not available on Windows" if windows? - - data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - - original_recs = recommender.user_recs(1) - - recommender.optimize_user_recs - - recs = recommender.user_recs(1) - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } - original_recs.zip(recs).each do |exp, act| - assert_in_delta exp[:score], act[:score] - end - assert_equal 5, recs.size - end - - def test_optimize_item_recs - skip "Faiss not available on Windows" if windows? - - data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - - original_recs = recommender.item_recs("Star Wars (1977)") - - recommender.optimize_item_recs(library: "faiss") - - recs = recommender.item_recs("Star Wars (1977)") - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } - original_recs.zip(recs).each do |exp, act| - assert_in_delta exp[:score], act[:score] - end - assert_equal 5, recs.size - - item_ids = recs.map { |r| r[:item_id] } - assert_includes item_ids, "Empire Strikes Back, The (1980)" - assert_includes item_ids, "Return of the Jedi (1983)" - - assert_in_delta 0.9972, recs.first[:score], 0.01 - end - - def test_optimize_similar_users - skip "Faiss not available on Windows" if windows? - - data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - - original_recs = recommender.similar_users(1) - - recommender.optimize_similar_users(library: "faiss") - - recs = recommender.similar_users(1) - - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } - original_recs.zip(recs).each do |exp, act| - assert_in_delta exp[:score], act[:score] - end - assert_equal 5, recs.size - end - - def test_optimize_item_recs_ngt - skip "NGT not available on Windows" if windows? - - data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - - original_recs = recommender.item_recs("Star Wars (1977)") - - recommender.optimize_item_recs(library: "ngt") - - recs = recommender.item_recs("Star Wars (1977)") - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } - original_recs.zip(recs).each do |exp, act| - assert_in_delta exp[:score], act[:score] - end - assert_equal 5, recs.size - - item_ids = recs.map { |r| r[:item_id] } - assert_includes item_ids, "Empire Strikes Back, The (1980)" - assert_includes item_ids, "Return of the Jedi (1983)" - - assert_in_delta 0.9972, recs.first[:score], 0.01 - end end diff --git a/test/test_helper.rb b/test/test_helper.rb index 27c6f72..419a47e 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -8,9 +8,3 @@ require "rover" require_relative "support/active_record" - -class Minitest::Test - def windows? - Gem.win_platform? - end -end From 3865c1752ae5cf293c616b2fb7cdcde7849ea76d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 21:29:45 -0800 Subject: [PATCH 031/158] Test similar_users with NGT [skip ci] --- test/optimize_test.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 319f41e..e574fc5 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -86,6 +86,24 @@ def test_optimize_item_recs_ngt assert_in_delta 0.9972, recs.first[:score], 0.01 end + def test_optimize_similar_users_ngt + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data) + + original_recs = recommender.similar_users(1) + + recommender.optimize_similar_users(library: "ngt") + + recs = recommender.similar_users(1) + + assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + original_recs.zip(recs).each do |exp, act| + assert_in_delta exp[:score], act[:score] + end + assert_equal 5, recs.size + end + def windows? Gem.win_platform? end From 6d841b8f98ec3add7b5473f5c322bbfafb804e23 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 21:43:48 -0800 Subject: [PATCH 032/158] Prep to change key --- lib/disco/recommender.rb | 11 +++++++---- test/model_test.rb | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index b1fe673..952a6af 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -248,9 +248,12 @@ def norms(factors) norms end - # TODO change key to user_id for similar_users in 0.3.0 def similar(id, map, factors, norms, count, index) i = map[id] + + # TODO use user_id for similar_users in 0.3.0 + key = :item_id + if i if index && count keys = map.keys @@ -258,14 +261,14 @@ def similar(id, map, factors, norms, count, index) if defined?(Faiss) && index.is_a?(Faiss::Index) distances, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } ids.zip(distances).map do |id, distance| - {item_id: keys[id], score: distance} + {key => keys[id], score: distance} end[1..-1] else result = index.search(factors[i, true], size: count + 1)[1..-1] result.map do |v| { # ids from batch_insert start at 1 instead of 0 - item_id: keys[v[:id] - 1], + key => keys[v[:id] - 1], # convert cosine distance to cosine similarity score: 1 - v[:distance] } @@ -278,7 +281,7 @@ def similar(id, map, factors, norms, count, index) predictions = map.keys.zip(predictions).map do |item_id, pred| - {item_id: item_id, score: pred} + {key => item_id, score: pred} end predictions.delete_at(i) diff --git a/test/model_test.rb b/test/model_test.rb index 9e41faf..677ebac 100644 --- a/test/model_test.rb +++ b/test/model_test.rb @@ -1,7 +1,7 @@ require_relative "test_helper" class ModelTest < Minitest::Test - def test_works + def test_recommendations user = User.create! products = Product.create!([{name: "Product A"}, {name: "Product B"}].shuffle) user.update_recommended_products([ From 3b12b033e5367a9a6d1e3da1486243614f8c24f0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 22:02:09 -0800 Subject: [PATCH 033/158] Improved performance --- lib/disco/recommender.rb | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 952a6af..69e5a15 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -251,13 +251,13 @@ def norms(factors) def similar(id, map, factors, norms, count, index) i = map[id] - # TODO use user_id for similar_users in 0.3.0 - key = :item_id - if i - if index && count - keys = map.keys + # TODO use user_id for similar_users in 0.3.0 + key = :item_id + + keys = map.keys + if index && count if defined?(Faiss) && index.is_a?(Faiss::Index) distances, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } ids.zip(distances).map do |id, distance| @@ -278,19 +278,17 @@ def similar(id, map, factors, norms, count, index) # cosine similarity without norms[i] # otherwise, denominator would be (norms[i] * norms) predictions = factors.inner(factors[i, true]) / norms + indexes = predictions.sort_index + indexes = indexes[(-count - 1)..-2] if count + indexes = indexes.reverse - predictions = - map.keys.zip(predictions).map do |item_id, pred| - {key => item_id, score: pred} - end - - predictions.delete_at(i) - predictions.sort_by! { |pred| -pred[:score] } # already sorted by id - predictions = predictions.first(count) if count # divide by norms[i] to get cosine similarity # only need to do for returned records - predictions.each { |pred| pred[:score] /= norms[i] } - predictions + scores = predictions[indexes] / norms[i] + + indexes.size.times.map do |i| + {key => keys[indexes[i]], score: scores[i]} + end end else [] From f12c6b1075c719819cb44a1dcd05e4640bc36312 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 22:07:23 -0800 Subject: [PATCH 034/158] Fixed test --- lib/disco/recommender.rb | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 69e5a15..06b6102 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -251,7 +251,7 @@ def norms(factors) def similar(id, map, factors, norms, count, index) i = map[id] - if i + if i && factors.shape[0] > 1 # TODO use user_id for similar_users in 0.3.0 key = :item_id @@ -275,16 +275,11 @@ def similar(id, map, factors, norms, count, index) end end else - # cosine similarity without norms[i] - # otherwise, denominator would be (norms[i] * norms) - predictions = factors.inner(factors[i, true]) / norms + predictions = factors.inner(factors[i, true] / norms[i]) / norms indexes = predictions.sort_index indexes = indexes[(-count - 1)..-2] if count indexes = indexes.reverse - - # divide by norms[i] to get cosine similarity - # only need to do for returned records - scores = predictions[indexes] / norms[i] + scores = predictions[indexes] indexes.size.times.map do |i| {key => keys[indexes[i]], score: scores[i]} From a9df9f04f07a15d0e3f4aa947a9abc258eb54a7d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 22:23:29 -0800 Subject: [PATCH 035/158] Use Numo for performance --- lib/disco/recommender.rb | 19 +++++++++++++------ test/optimize_test.rb | 2 ++ 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 06b6102..15630b4 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -95,13 +95,20 @@ def user_recs(user_id, count: 5, item_ids: nil) u = @user_map[user_id] if u + rated = @rated[u] + keys = @item_map.keys + if @user_recs_index && count - distances, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + @rated[u].size).map { |v| v.to_a[0] } - distances.map! { |v| v < @min_rating ? @min_rating : (v > @max_rating ? @max_rating : v) } if @min_rating - keys = @item_map.keys - ids.zip(distances).reject { |item_id, _| @rated[u][item_id] }.map do |item_id, distance| - {item_id: keys[item_id], score: distance} - end.first(count) + distances, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + @rated[u].size) + distances.inplace.clip(@min_rating, @max_rating) if @min_rating + result = [] + ids[0, true].each_with_index do |item_id, i| + next if rated[item_id] + + result << {item_id: keys[item_id], score: distances[0, i]} + break if result.size == count + end + result else predictions = @item_factors.inner(@user_factors[u, true]) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index e574fc5..7074bde 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -87,6 +87,8 @@ def test_optimize_item_recs_ngt end def test_optimize_similar_users_ngt + skip "Flaky test" + data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) From fe27323ca957a2b01531485b51532b17fccd8692 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 22:57:55 -0800 Subject: [PATCH 036/158] Improved performance of user_recs --- lib/disco/recommender.rb | 63 +++++++++++++++++----------------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 15630b4..2773934 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -94,56 +94,45 @@ def user_recs(user_id, count: 5, item_ids: nil) check_fit u = @user_map[user_id] + result = [] if u - rated = @rated[u] + rated = item_ids ? {} : @rated[u] keys = @item_map.keys - if @user_recs_index && count - distances, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + @rated[u].size) - distances.inplace.clip(@min_rating, @max_rating) if @min_rating - result = [] - ids[0, true].each_with_index do |item_id, i| - next if rated[item_id] - - result << {item_id: keys[item_id], score: distances[0, i]} - break if result.size == count - end - result + if item_ids + ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact) + return [] if ids.size == 0 + + predictions = @item_factors[ids, true].inner(@user_factors[u, true]) + indexes = predictions.sort_index.reverse + indexes = indexes[0...[count + rated.size, indexes.size].min] if count + predictions = predictions[indexes] + ids = ids[indexes] + elsif @user_recs_index + predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] } else predictions = @item_factors.inner(@user_factors[u, true]) + # TODO make sure reverse isn't hurting performance + indexes = predictions.sort_index.reverse + indexes = indexes[0...[count + rated.size, indexes.size].min] if count + predictions = predictions[indexes] + ids = indexes + end - predictions = - @item_map.keys.zip(predictions).map do |item_id, pred| - {item_id: item_id, score: pred} - end - - if item_ids - idx = item_ids.map { |i| @item_map[i] }.compact - predictions = predictions.values_at(*idx) - else - @rated[u].keys.sort_by { |v| -v }.each do |i| - predictions.delete_at(i) - end - end - - predictions.sort_by! { |pred| -pred[:score] } # already sorted by id - predictions = predictions.first(count) if count && !item_ids + predictions.inplace.clip(@min_rating, @max_rating) if @min_rating - # clamp *after* sorting - # also, only needed for returned predictions - if @min_rating - predictions.each do |pred| - pred[:score] = pred[:score].clamp(@min_rating, @max_rating) - end - end + ids.each_with_index do |item_id, i| + next if rated[item_id] - predictions + result << {item_id: keys[item_id], score: predictions[i]} + break if result.size == count end else # no items if user is unknown # TODO maybe most popular items - [] end + + result end def optimize_user_recs From 099ae86455a61045655d3720c0fbb24878936562 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:00:35 -0800 Subject: [PATCH 037/158] Improved code --- lib/disco/recommender.rb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 2773934..8f50f57 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -97,7 +97,7 @@ def user_recs(user_id, count: 5, item_ids: nil) result = [] if u rated = item_ids ? {} : @rated[u] - keys = @item_map.keys + max_count = count + rated.size if item_ids ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact) @@ -105,22 +105,23 @@ def user_recs(user_id, count: 5, item_ids: nil) predictions = @item_factors[ids, true].inner(@user_factors[u, true]) indexes = predictions.sort_index.reverse - indexes = indexes[0...[count + rated.size, indexes.size].min] if count + indexes = indexes[0...[max_count, indexes.size].min] if count predictions = predictions[indexes] ids = ids[indexes] elsif @user_recs_index - predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] } + predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), max_count).map { |v| v[0, true] } else predictions = @item_factors.inner(@user_factors[u, true]) # TODO make sure reverse isn't hurting performance indexes = predictions.sort_index.reverse - indexes = indexes[0...[count + rated.size, indexes.size].min] if count + indexes = indexes[0...[max_count, indexes.size].min] if count predictions = predictions[indexes] ids = indexes end predictions.inplace.clip(@min_rating, @max_rating) if @min_rating + keys = @item_map.keys ids.each_with_index do |item_id, i| next if rated[item_id] From 7ce9a5851eed86e78ea626f6634373e93b52687f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:03:16 -0800 Subject: [PATCH 038/158] Improved code [skip ci] --- lib/disco/recommender.rb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 8f50f57..3f31ca4 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -94,7 +94,6 @@ def user_recs(user_id, count: 5, item_ids: nil) check_fit u = @user_map[user_id] - result = [] if u rated = item_ids ? {} : @rated[u] max_count = count + rated.size @@ -122,18 +121,19 @@ def user_recs(user_id, count: 5, item_ids: nil) predictions.inplace.clip(@min_rating, @max_rating) if @min_rating keys = @item_map.keys + result = [] ids.each_with_index do |item_id, i| next if rated[item_id] result << {item_id: keys[item_id], score: predictions[i]} break if result.size == count end + result else # no items if user is unknown # TODO maybe most popular items + [] end - - result end def optimize_user_recs From d1140d66b20cde42430b05319b5414b5a7f01083 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:19:29 -0800 Subject: [PATCH 039/158] Improved similar code --- lib/disco/recommender.rb | 52 +++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 30 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 3f31ca4..ae72c48 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -96,7 +96,6 @@ def user_recs(user_id, count: 5, item_ids: nil) if u rated = item_ids ? {} : @rated[u] - max_count = count + rated.size if item_ids ids = Numo::NArray.cast(item_ids.map { |i| @item_map[i] }.compact) @@ -104,16 +103,16 @@ def user_recs(user_id, count: 5, item_ids: nil) predictions = @item_factors[ids, true].inner(@user_factors[u, true]) indexes = predictions.sort_index.reverse - indexes = indexes[0...[max_count, indexes.size].min] if count + indexes = indexes[0...[count + rated.size, indexes.size].min] if count predictions = predictions[indexes] ids = ids[indexes] - elsif @user_recs_index - predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), max_count).map { |v| v[0, true] } + elsif @user_recs_index && count + predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] } else predictions = @item_factors.inner(@user_factors[u, true]) # TODO make sure reverse isn't hurting performance indexes = predictions.sort_index.reverse - indexes = indexes[0...[max_count, indexes.size].min] if count + indexes = indexes[0...[count + rated.size, indexes.size].min] if count predictions = predictions[indexes] ids = indexes end @@ -249,38 +248,31 @@ def similar(id, map, factors, norms, count, index) i = map[id] if i && factors.shape[0] > 1 - # TODO use user_id for similar_users in 0.3.0 - key = :item_id - - keys = map.keys - if index && count if defined?(Faiss) && index.is_a?(Faiss::Index) - distances, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } - ids.zip(distances).map do |id, distance| - {key => keys[id], score: distance} - end[1..-1] + predictions, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } else - result = index.search(factors[i, true], size: count + 1)[1..-1] - result.map do |v| - { - # ids from batch_insert start at 1 instead of 0 - key => keys[v[:id] - 1], - # convert cosine distance to cosine similarity - score: 1 - v[:distance] - } - end + result = index.search(factors[i, true], size: count + 1) + # ids from batch_insert start at 1 instead of 0 + ids = result.map { |v| v[:id] - 1 } + # convert cosine distance to cosine similarity + predictions = result.map { |v| 1 - v[:distance] } end else predictions = factors.inner(factors[i, true] / norms[i]) / norms - indexes = predictions.sort_index - indexes = indexes[(-count - 1)..-2] if count - indexes = indexes.reverse - scores = predictions[indexes] + indexes = predictions.sort_index.reverse + indexes = indexes[0...[count + 1, indexes.size].min] if count + predictions = predictions[indexes] + ids = indexes + end - indexes.size.times.map do |i| - {key => keys[indexes[i]], score: scores[i]} - end + keys = map.keys + + # TODO use user_id for similar_users in 0.3.0 + key = :item_id + + (1...ids.size).map do |i| + {key => keys[ids[i]], score: predictions[i]} end else [] From 3f5597064e9cc027c0a7f02fdcb4ebbdf7d59601 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:28:46 -0800 Subject: [PATCH 040/158] Fixed CI for now --- gemfiles/activerecord52.gemfile | 2 +- test/optimize_test.rb | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile index 1253ee7..c436704 100644 --- a/gemfiles/activerecord52.gemfile +++ b/gemfiles/activerecord52.gemfile @@ -9,4 +9,4 @@ gem "sqlite3" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" -gem "faiss" +# gem "faiss" diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 7074bde..166f11f 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -6,6 +6,8 @@ def setup end def test_optimize_user_recs + skip unless faiss? + data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -23,6 +25,8 @@ def test_optimize_user_recs end def test_optimize_item_recs + skip unless faiss? + data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -46,6 +50,8 @@ def test_optimize_item_recs end def test_optimize_similar_users + skip unless faiss? + data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -106,6 +112,10 @@ def test_optimize_similar_users_ngt assert_equal 5, recs.size end + def faiss? + RUBY_VERSION.to_f >= 2.7 + end + def windows? Gem.win_platform? end From d9253bebe4f933fed689e51386674be8d4ca9df7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:35:17 -0800 Subject: [PATCH 041/158] Fixed CI matrix --- .github/workflows/build.yml | 2 +- test/optimize_test.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1979a37..53d4309 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,6 +20,6 @@ jobs: - uses: actions/checkout@v2 - uses: ruby/setup-ruby@v1 with: - ruby-version: 2.7 + ruby-version: ${{ matrix.ruby }} bundler-cache: true - run: bundle exec rake test diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 166f11f..1926c9f 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -113,7 +113,7 @@ def test_optimize_similar_users_ngt end def faiss? - RUBY_VERSION.to_f >= 2.7 + true end def windows? From 3a050586c71170b833193cf138c48e87edcf6c7a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Feb 2021 23:38:56 -0800 Subject: [PATCH 042/158] Re-add Faiss to CI (error likely due to gem cache between different machines) --- gemfiles/activerecord52.gemfile | 2 +- test/optimize_test.rb | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile index c436704..1253ee7 100644 --- a/gemfiles/activerecord52.gemfile +++ b/gemfiles/activerecord52.gemfile @@ -9,4 +9,4 @@ gem "sqlite3" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" -# gem "faiss" +gem "faiss" diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 1926c9f..7074bde 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -6,8 +6,6 @@ def setup end def test_optimize_user_recs - skip unless faiss? - data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -25,8 +23,6 @@ def test_optimize_user_recs end def test_optimize_item_recs - skip unless faiss? - data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -50,8 +46,6 @@ def test_optimize_item_recs end def test_optimize_similar_users - skip unless faiss? - data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) @@ -112,10 +106,6 @@ def test_optimize_similar_users_ngt assert_equal 5, recs.size end - def faiss? - true - end - def windows? Gem.win_platform? end From 560e12746d814cac701329dea43d5340ea0a654e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 00:06:54 -0800 Subject: [PATCH 043/158] Fixed flaky test --- test/optimize_test.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 7074bde..4bd1447 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -86,18 +86,18 @@ def test_optimize_item_recs_ngt assert_in_delta 0.9972, recs.first[:score], 0.01 end + # flaky with count: 5 (likely due to ANN) + # however, count: 10 seems to match exactly def test_optimize_similar_users_ngt - skip "Flaky test" - data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) - original_recs = recommender.similar_users(1) + original_recs = recommender.similar_users(1, count: 10) recommender.optimize_similar_users(library: "ngt") - recs = recommender.similar_users(1) + recs = recommender.similar_users(1, count: 10) assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } original_recs.zip(recs).each do |exp, act| From 08968d807cd84c2fb5217bd46b417f1fdd2cde7a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 00:09:36 -0800 Subject: [PATCH 044/158] Fixed CI --- .github/workflows/build.yml | 4 ++++ test/optimize_test.rb | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 53d4309..74ced57 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,4 +22,8 @@ jobs: with: ruby-version: ${{ matrix.ruby }} bundler-cache: true + - uses: actions/cache@v2 + with: + path: ~/.disco + key: disco - run: bundle exec rake test diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 4bd1447..b9ec748 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -103,7 +103,7 @@ def test_optimize_similar_users_ngt original_recs.zip(recs).each do |exp, act| assert_in_delta exp[:score], act[:score] end - assert_equal 5, recs.size + assert_equal 10, recs.size end def windows? From a8b04d87a44a57fe032627726c1f637aaa65d4ad Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 00:29:16 -0800 Subject: [PATCH 045/158] Improved performance by memoizing normalized factors --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 36 ++++++++++++++++++------------------ 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17d0032..1fd178e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ - Added `optimize_similar_users` method - Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods - Added `rmse` method +- Improved performance ## 0.2.4 (2021-02-15) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index ae72c48..5882aaf 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -153,24 +153,24 @@ def optimize_user_recs def optimize_similar_items(library: nil) check_fit - @similar_items_index = create_index(:item, library: library) + @similar_items_index = create_index(item_norms, library: library) end alias_method :optimize_item_recs, :optimize_similar_items def optimize_similar_users(library: nil) check_fit - @similar_users_index = create_index(:user, library: library) + @similar_users_index = create_index(user_norms, library: library) end def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, @item_factors, @similar_items_index ? @item_norms : item_norms, count, @similar_items_index) + similar(item_id, @item_map, item_norms, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, @user_factors, @similar_users_index ? @user_norms : user_norms, count, @similar_users_index) + similar(user_id, @user_map, user_norms, count, @similar_users_index) end def user_ids @@ -201,29 +201,29 @@ def item_factors(item_id = nil) private - def create_index(key, library:) + def create_index(norm_factors, library:) # TODO make Faiss the default in 0.3.0 library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt" - factors = send("#{key}_factors") - case library when "faiss" require "faiss" # inner product is cosine similarity with normalized vectors # https://github.com/facebookresearch/faiss/issues/95 - index = Faiss::IndexFlatIP.new(factors.shape[1]) - index.add(factors / send("#{key}_norms").expand_dims(1)) + index = Faiss::IndexFlatIP.new(norm_factors.shape[1]) + index.add(norm_factors) index when "ngt" require "ngt" # could speed up search with normalized cosine # https://github.com/yahoojapan/NGT/issues/36 - index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - ids = index.batch_insert(factors) - raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] + # + # NGT normalizes so could just pass factors, but keep code simple for now + index = Ngt::Index.new(norm_factors.shape[1], distance_type: "Cosine") + ids = index.batch_insert(norm_factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != norm_factors.shape[0] index else raise ArgumentError, "Invalid library: #{library}" @@ -241,25 +241,25 @@ def item_norms def norms(factors) norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1)) norms[norms.eq(0)] = 1e-10 # no zeros - norms + factors / norms.expand_dims(1) end - def similar(id, map, factors, norms, count, index) + def similar(id, map, norm_factors, count, index) i = map[id] - if i && factors.shape[0] > 1 + if i && norm_factors.shape[0] > 1 if index && count if defined?(Faiss) && index.is_a?(Faiss::Index) - predictions, ids = index.search(factors[i, true].expand_dims(0) / norms[i], count + 1).map { |v| v.to_a[0] } + predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] } else - result = index.search(factors[i, true], size: count + 1) + result = index.search(norm_factors[i, true], size: count + 1) # ids from batch_insert start at 1 instead of 0 ids = result.map { |v| v[:id] - 1 } # convert cosine distance to cosine similarity predictions = result.map { |v| 1 - v[:distance] } end else - predictions = factors.inner(factors[i, true] / norms[i]) / norms + predictions = norm_factors.inner(norm_factors[i, true]) indexes = predictions.sort_index.reverse indexes = indexes[0...[count + 1, indexes.size].min] if count predictions = predictions[indexes] From 0b4fafa82eff1131008bbde0535bfa5011f7ccf4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 01:17:04 -0800 Subject: [PATCH 046/158] Added tests for count: nil --- test/recommender_test.rb | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index c23479e..77ec7b5 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -28,6 +28,10 @@ def test_explicit assert_includes item_ids, "Return of the Jedi (1983)" assert_in_delta 0.9972, recs.first[:score], 0.01 + + assert_equal (1664 - data.select { |v| v[:user_id] == 1 }.map { |v| v[:item_id] }.uniq.size), recommender.user_recs(1, count: nil).size + assert_equal 1663, recommender.item_recs("Star Wars (1977)", count: nil).size + assert_equal 942, recommender.similar_users(1, count: nil).size end def test_implicit From 96edff5787497a189228331955250fe420cfd03b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 02:04:43 -0800 Subject: [PATCH 047/158] Improved predict test --- test/recommender_test.rb | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 77ec7b5..c9fa88a 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -169,12 +169,18 @@ def test_user_recs_new_item assert_empty [], recommender.user_recs(1, item_ids: [1000]) end - # TODO better test (need deterministic output) def test_predict data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20) - recommender.fit(data) - assert_kind_of Array, recommender.predict(data.first(5)) + data.shuffle!(random: Random.new(1)) + + train_set = data.first(80000) + valid_set = data.last(20000) + + recommender = Disco::Recommender.new(factors: 20, verbose: false) + recommender.fit(train_set, validation_set: valid_set) + + predictions = recommender.predict(valid_set) + assert_in_delta 0.91, Disco::Metrics.rmse(valid_set.map { |v| v[:rating] }, predictions), 0.01 end def test_predict_new_user From 387173357f8049f0a0627c2239ba14b65dc59d2d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 02:07:47 -0800 Subject: [PATCH 048/158] Fixed test --- test/optimize_test.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index b9ec748..312226a 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -86,8 +86,6 @@ def test_optimize_item_recs_ngt assert_in_delta 0.9972, recs.first[:score], 0.01 end - # flaky with count: 5 (likely due to ANN) - # however, count: 10 seems to match exactly def test_optimize_similar_users_ngt data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20) @@ -99,8 +97,13 @@ def test_optimize_similar_users_ngt recs = recommender.similar_users(1, count: 10) + # won't match exactly due to ANN + matching_ids = original_recs.map { |v| v[:item_id] } & recs.map { |v| v[:item_id] } + assert_includes 8..10, matching_ids.size assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } - original_recs.zip(recs).each do |exp, act| + matching_ids.each do |item_id| + exp = original_recs.find { |v| v[:item_id] == item_id } + act = recs.find { |v| v[:item_id] == item_id } assert_in_delta exp[:score], act[:score] end assert_equal 10, recs.size From 73fd0e30b90b5a4f943cf45869a79147b6aa53a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 02:57:39 -0800 Subject: [PATCH 049/158] Use create_index method for optimize_user_recs --- lib/disco/recommender.rb | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 5882aaf..617e641 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -137,18 +137,7 @@ def user_recs(user_id, count: 5, item_ids: nil) def optimize_user_recs check_fit - - require "faiss" - - # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes - # TODO use non-exact index - @user_recs_index = Faiss::IndexFlatIP.new(item_factors.shape[1]) - - # ids are from 0...total - # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89 - @user_recs_index.add(item_factors) - - nil + @user_recs_index = create_index(item_factors, library: "faiss") end def optimize_similar_items(library: nil) @@ -201,7 +190,8 @@ def item_factors(item_id = nil) private - def create_index(norm_factors, library:) + # factors should already be normalized for similar users/items + def create_index(factors, library:) # TODO make Faiss the default in 0.3.0 library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt" @@ -211,8 +201,15 @@ def create_index(norm_factors, library:) # inner product is cosine similarity with normalized vectors # https://github.com/facebookresearch/faiss/issues/95 - index = Faiss::IndexFlatIP.new(norm_factors.shape[1]) - index.add(norm_factors) + # + # TODO use non-exact index + # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes + index = Faiss::IndexFlatIP.new(factors.shape[1]) + + # ids are from 0...total + # https://github.com/facebookresearch/faiss/blob/96b740abedffc8f67389f29c2a180913941534c6/faiss/Index.h#L89 + index.add(factors) + index when "ngt" require "ngt" @@ -221,9 +218,9 @@ def create_index(norm_factors, library:) # https://github.com/yahoojapan/NGT/issues/36 # # NGT normalizes so could just pass factors, but keep code simple for now - index = Ngt::Index.new(norm_factors.shape[1], distance_type: "Cosine") - ids = index.batch_insert(norm_factors) - raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != norm_factors.shape[0] + index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") + ids = index.batch_insert(factors) + raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] index else raise ArgumentError, "Invalid library: #{library}" From 3a85d9ea113eef51d6c55ade18a6f0f27977d1ab Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 03:01:54 -0800 Subject: [PATCH 050/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 617e641..4e625dd 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -216,11 +216,13 @@ def create_index(factors, library:) # could speed up search with normalized cosine # https://github.com/yahoojapan/NGT/issues/36 - # - # NGT normalizes so could just pass factors, but keep code simple for now index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") + + # NGT normalizes so could call create_index with factors instead of norms + # but keep code simple for now ids = index.batch_insert(factors) raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] + index else raise ArgumentError, "Invalid library: #{library}" From df7819ea81d9879a9f0913590992b002494688a8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 03:16:12 -0800 Subject: [PATCH 051/158] Moved optimize methods [skip ci] --- lib/disco/recommender.rb | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 4e625dd..1244290 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -135,22 +135,6 @@ def user_recs(user_id, count: 5, item_ids: nil) end end - def optimize_user_recs - check_fit - @user_recs_index = create_index(item_factors, library: "faiss") - end - - def optimize_similar_items(library: nil) - check_fit - @similar_items_index = create_index(item_norms, library: library) - end - alias_method :optimize_item_recs, :optimize_similar_items - - def optimize_similar_users(library: nil) - check_fit - @similar_users_index = create_index(user_norms, library: library) - end - def similar_items(item_id, count: 5) check_fit similar(item_id, @item_map, item_norms, count, @similar_items_index) @@ -188,6 +172,22 @@ def item_factors(item_id = nil) end end + def optimize_user_recs + check_fit + @user_recs_index = create_index(item_factors, library: "faiss") + end + + def optimize_similar_items(library: nil) + check_fit + @similar_items_index = create_index(item_norms, library: library) + end + alias_method :optimize_item_recs, :optimize_similar_items + + def optimize_similar_users(library: nil) + check_fit + @similar_users_index = create_index(user_norms, library: library) + end + private # factors should already be normalized for similar users/items From 084e761b46f8ad576a6c4b24f4ee6a13c2e5c6c7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 03:53:09 -0800 Subject: [PATCH 052/158] Reset indexes after fit [skip ci] --- lib/disco/recommender.rb | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 1244290..f928a8e 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -67,8 +67,9 @@ def fit(train_set, validation_set: nil) @user_factors = model.p_factors(format: :numo) @item_factors = model.q_factors(format: :numo) - @user_index = nil - @item_index = nil + @user_recs_index = nil + @similar_users_index = nil + @similar_items_index = nil end # generates a prediction even if a user has already rated the item From 868b3e80f576421ffee30c2a7ac617a879adbbf9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 10:34:08 -0800 Subject: [PATCH 053/158] Fixed test [skip ci] --- test/optimize_test.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index 312226a..ea3a545 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -100,7 +100,6 @@ def test_optimize_similar_users_ngt # won't match exactly due to ANN matching_ids = original_recs.map { |v| v[:item_id] } & recs.map { |v| v[:item_id] } assert_includes 8..10, matching_ids.size - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } matching_ids.each do |item_id| exp = original_recs.find { |v| v[:item_id] == item_id } act = recs.find { |v| v[:item_id] == item_id } From 305923990fe3be845b728458245b7dd4fca70018 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 10:34:51 -0800 Subject: [PATCH 054/158] Added top_items method - #8 --- CHANGELOG.md | 1 + Gemfile | 1 + README.md | 10 ++++++++ lib/disco/recommender.rb | 49 +++++++++++++++++++++++++++++++++++++--- test/recommender_test.rb | 35 ++++++++++++++++++++++++++++ 5 files changed, 93 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1fd178e..1c892ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.2.5 (unreleased) +- Added `top_items` method - Added `optimize_similar_users` method - Added support for Faiss for `optimize_item_recs` and `optimize_similar_users` methods - Added `rmse` method diff --git a/Gemfile b/Gemfile index 4cb7fc2..cbfa939 100644 --- a/Gemfile +++ b/Gemfile @@ -10,3 +10,4 @@ gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" +gem "wilson_score" diff --git a/README.md b/README.md index 9feec90..f83e8b1 100644 --- a/README.md +++ b/README.md @@ -239,6 +239,16 @@ There are a number of ways to deal with this, but here are some common ones: - For user-based recommendations, show new users the most popular items. - For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity). +Get top items with: + +```ruby +recommender = Disco::Recommender.new(top_items: true) +recommender.fit(data) +recommender.top_items +``` + +This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback. + ## Data Data can be an array of hashes diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index f928a8e..bd123d8 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -2,12 +2,13 @@ module Disco class Recommender attr_reader :global_mean - def initialize(factors: 8, epochs: 20, verbose: nil) + def initialize(factors: 8, epochs: 20, verbose: nil, top_items: false) @factors = factors @epochs = epochs @verbose = verbose @user_map = {} @item_map = {} + @top_items = top_items end def fit(train_set, validation_set: nil) @@ -41,6 +42,16 @@ def fit(train_set, validation_set: nil) end @rated.default = nil + if @top_items + @item_count = [0] * @item_map.size + @item_sum = [0.0] * @item_map.size + train_set.each do |v| + i = @item_map[v[:item_id]] + @item_count[i] += 1 + @item_sum[i] += (v[value_key] || 1) + end + end + eval_set = nil if validation_set eval_set = [] @@ -129,9 +140,9 @@ def user_recs(user_id, count: 5, item_ids: nil) break if result.size == count end result + elsif @top_items + top_items(count: count) else - # no items if user is unknown - # TODO maybe most popular items [] end end @@ -147,6 +158,27 @@ def similar_users(user_id, count: 5) similar(user_id, @user_map, user_norms, count, @similar_users_index) end + def top_items(count: 5) + check_fit + raise "top_items not computed" unless @top_items + + if @implicit + scores = @item_count + else + require "wilson_score" + + range = @min_rating..@max_rating + scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) } + end + + scores = scores.map.with_index.sort_by { |s, _| -s } + scores = scores.first(count) if count + item_ids = item_ids() + scores.map do |s, i| + {item_id: item_ids[i], score: s} + end + end + def user_ids @user_map.keys end @@ -341,6 +373,11 @@ def marshal_dump obj[:max_rating] = @max_rating end + if @top_items + obj[:item_count] = @item_count + obj[:item_sum] = @item_sum + end + obj end @@ -357,6 +394,12 @@ def marshal_load(obj) @min_rating = obj[:min_rating] @max_rating = obj[:max_rating] end + + @top_items = obj.key?(:item_count) + if @top_items + @item_count = obj[:item_count] + @item_sum = obj[:item_sum] + end end end end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index c9fa88a..93c8d36 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -93,6 +93,41 @@ def test_rated assert_equal ["A", "B"], recommender.user_recs(2).map { |r| r[:item_id] }.sort end + def test_top_items_explicit + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + top_items = recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Marshal.load(Marshal.dump(recommender)) + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + end + + def test_top_items_implicit + data = Disco.load_movielens + data.each { |v| v.delete(:rating) } + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + top_items = recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Marshal.load(Marshal.dump(recommender)) + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + end + + def test_top_items_not_computed + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20) + recommender.fit(data.first(5)) + error = assert_raises do + recommender.top_items + end + assert_equal "top_items not computed", error.message + end + def test_ids data = [ {user_id: 1, item_id: "A"}, From 897584f3728c482c1f635de997cb60db418bfebb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 10:38:51 -0800 Subject: [PATCH 055/158] Added wilson_score to all Gemfiles --- gemfiles/activerecord52.gemfile | 1 + gemfiles/activerecord60.gemfile | 1 + 2 files changed, 2 insertions(+) diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile index 1253ee7..07f0685 100644 --- a/gemfiles/activerecord52.gemfile +++ b/gemfiles/activerecord52.gemfile @@ -10,3 +10,4 @@ gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" +gem "wilson_score" diff --git a/gemfiles/activerecord60.gemfile b/gemfiles/activerecord60.gemfile index 2e884d6..ef9bee6 100644 --- a/gemfiles/activerecord60.gemfile +++ b/gemfiles/activerecord60.gemfile @@ -10,3 +10,4 @@ gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" +gem "wilson_score" From 20ed83bffa7f120d7489ac53ab54f9bbc407affe Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 10:46:56 -0800 Subject: [PATCH 056/158] Version bump to 0.2.5 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1c892ca..0897dd8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.5 (unreleased) +## 0.2.5 (2021-02-20) - Added `top_items` method - Added `optimize_similar_users` method diff --git a/lib/disco/version.rb b/lib/disco/version.rb index a330ec9..65f4839 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.4" + VERSION = "0.2.5" end From 2d2ae1f6819af50a568298019b98b4476279d01c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 10:49:54 -0800 Subject: [PATCH 057/158] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f83e8b1..5ab2ed0 100644 --- a/README.md +++ b/README.md @@ -269,7 +269,7 @@ Or a Daru data frame Daru::DataFrame.from_csv("ratings.csv") ``` -## Performance [master] +## Performance If you have a large number of users or items, you can use an approximate nearest neighbors library like [Faiss](https://github.com/ankane/faiss) to improve the performance of certain methods. From a788de1bfcf8adc5970c953b3a583558eb8bc5b3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 11:00:09 -0800 Subject: [PATCH 058/158] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ab2ed0..4e8fda3 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ recommender.fit([ ]) ``` -> Use `value` instead of rating for implicit feedback +> Use `value` instead of `rating` for implicit feedback Get user-based recommendations - “users like you also liked” From ae7a4576df377061af334809563a93c5ee712427 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 11:02:15 -0800 Subject: [PATCH 059/158] Added missing word [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4e8fda3..9f51b73 100644 --- a/README.md +++ b/README.md @@ -247,7 +247,7 @@ recommender.fit(data) recommender.top_items ``` -This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) your application’s Gemfile) and item frequency for implicit feedback. +This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback. ## Data From de2f05e9a709c07d8c9f7ae73fcb672eb02e8d2f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 11:16:55 -0800 Subject: [PATCH 060/158] Reset norms when refitting [skip ci] --- lib/disco/recommender.rb | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index bd123d8..9a7163d 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -78,6 +78,9 @@ def fit(train_set, validation_set: nil) @user_factors = model.p_factors(format: :numo) @item_factors = model.q_factors(format: :numo) + @user_norms = nil + @item_norms = nil + @user_recs_index = nil @similar_users_index = nil @similar_items_index = nil From 2449811ed9f10e94ce9681228738b61f2879e449 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 15:35:24 -0800 Subject: [PATCH 061/158] Added basic benchmarking [skip ci] --- Rakefile | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/Rakefile b/Rakefile index 981f476..34beef8 100644 --- a/Rakefile +++ b/Rakefile @@ -7,3 +7,55 @@ Rake::TestTask.new do |t| t.pattern = "test/**/*_test.rb" t.warning = false # for daru end + +# TODO use benchmark-ips +def benchmark_user_recs(name, recommender) + ms = Benchmark.realtime do + recommender.user_ids.each do |user_id| + recommender.user_recs(user_id) + end + end + puts "%-8s %f" % [name, ms] +end + +# TODO use benchmark-ips +def benchmark_item_recs(name, recommender) + ms = Benchmark.realtime do + recommender.item_ids.each do |item_id| + recommender.item_recs(item_id) + end + end + puts "%-8s %f" % [name, ms] +end + +namespace :benchmark do + task :user_recs do + require "bundler/setup" + Bundler.require + require "benchmark" + + data = Disco.load_movielens + recommender = Disco::Recommender.new + recommender.fit(data) + + benchmark_user_recs("none", recommender) + recommender.optimize_user_recs + benchmark_user_recs("faiss", recommender) + end + + task :item_recs do + require "bundler/setup" + Bundler.require + require "benchmark" + + data = Disco.load_movielens + recommender = Disco::Recommender.new + recommender.fit(data) + + benchmark_item_recs("none", recommender) + recommender.optimize_item_recs(library: "ngt") + benchmark_item_recs("ngt", recommender) + recommender.optimize_item_recs(library: "faiss") + benchmark_item_recs("faiss", recommender) + end +end From 998f7b2374351b9972819a8ba4d7f5cc0b181468 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 16:41:06 -0800 Subject: [PATCH 062/158] Improved naming [skip ci] --- lib/disco/recommender.rb | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 9a7163d..bdcf951 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -78,8 +78,8 @@ def fit(train_set, validation_set: nil) @user_factors = model.p_factors(format: :numo) @item_factors = model.q_factors(format: :numo) - @user_norms = nil - @item_norms = nil + @normalized_user_factors = nil + @normalized_item_factors = nil @user_recs_index = nil @similar_users_index = nil @@ -152,13 +152,13 @@ def user_recs(user_id, count: 5, item_ids: nil) def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, item_norms, count, @similar_items_index) + similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, user_norms, count, @similar_users_index) + similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index) end def top_items(count: 5) @@ -215,13 +215,13 @@ def optimize_user_recs def optimize_similar_items(library: nil) check_fit - @similar_items_index = create_index(item_norms, library: library) + @similar_items_index = create_index(normalized_item_factors, library: library) end alias_method :optimize_item_recs, :optimize_similar_items def optimize_similar_users(library: nil) check_fit - @similar_users_index = create_index(user_norms, library: library) + @similar_users_index = create_index(normalized_user_factors, library: library) end private @@ -254,7 +254,7 @@ def create_index(factors, library:) # https://github.com/yahoojapan/NGT/issues/36 index = Ngt::Index.new(factors.shape[1], distance_type: "Cosine") - # NGT normalizes so could call create_index with factors instead of norms + # NGT normalizes so could call create_index without normalized factors # but keep code simple for now ids = index.batch_insert(factors) raise "Unexpected ids. Please report a bug." if ids.first != 1 || ids.last != factors.shape[0] @@ -265,15 +265,15 @@ def create_index(factors, library:) end end - def user_norms - @user_norms ||= norms(@user_factors) + def normalized_user_factors + @normalized_user_factors ||= normalize(@user_factors) end - def item_norms - @item_norms ||= norms(@item_factors) + def normalized_item_factors + @normalized_item_factors ||= normalize(@item_factors) end - def norms(factors) + def normalize(factors) norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1)) norms[norms.eq(0)] = 1e-10 # no zeros factors / norms.expand_dims(1) From 877d5ccc6f1210196c2334712d1a9b511b173043 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 16:49:41 -0800 Subject: [PATCH 063/158] Improved inspect method [skip ci] --- CHANGELOG.md | 4 ++++ lib/disco/recommender.rb | 4 ++++ test/recommender_test.rb | 3 +++ 3 files changed, 11 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0897dd8..ac821f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.6 (unreleased) + +- Improved `inspect` method + ## 0.2.5 (2021-02-20) - Added `top_items` method diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index bdcf951..a1c5c9d 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -224,6 +224,10 @@ def optimize_similar_users(library: nil) @similar_users_index = create_index(normalized_user_factors, library: library) end + def inspect + to_s # for now + end + private # factors should already be normalized for similar users/items diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 93c8d36..40f5c7c 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -32,6 +32,9 @@ def test_explicit assert_equal (1664 - data.select { |v| v[:user_id] == 1 }.map { |v| v[:item_id] }.uniq.size), recommender.user_recs(1, count: nil).size assert_equal 1663, recommender.item_recs("Star Wars (1977)", count: nil).size assert_equal 942, recommender.similar_users(1, count: nil).size + + assert recommender.inspect.size < 50 + assert recommender.to_s.size < 50 end def test_implicit From dd019c92e46a5d9464e0af818872c3a8d5beae70 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 16:54:56 -0800 Subject: [PATCH 064/158] Fixed error with fit after loading --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 8 +++++++- test/recommender_test.rb | 3 +++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ac821f5..9b558bd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.2.6 (unreleased) - Improved `inspect` method +- Fixed error with `fit` after loading ## 0.2.5 (2021-02-20) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index a1c5c9d..93eec61 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -372,7 +372,10 @@ def marshal_dump rated: @rated, global_mean: @global_mean, user_factors: @user_factors, - item_factors: @item_factors + item_factors: @item_factors, + factors: @factors, + epochs: @epochs, + verbose: @verbose } unless @implicit @@ -396,6 +399,9 @@ def marshal_load(obj) @global_mean = obj[:global_mean] @user_factors = obj[:user_factors] @item_factors = obj[:item_factors] + @factors = obj[:factors] + @epochs = obj[:epochs] + @verbose = obj[:verbose] unless @implicit @min_rating = obj[:min_rating] diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 40f5c7c..7bed26c 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -35,6 +35,9 @@ def test_explicit assert recommender.inspect.size < 50 assert recommender.to_s.size < 50 + + # fit after loading + recommender.fit(data.first(5)) end def test_implicit From 8e1e6c85e663be1f3861a1dcc921e0d2aea4db04 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:05:09 -0800 Subject: [PATCH 065/158] Improved performance [skip ci] --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b558bd..8474bdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.2.6 (unreleased) +- Improved performance - Improved `inspect` method - Fixed error with `fit` after loading diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 93eec61..b8b4d86 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -319,13 +319,14 @@ def similar(id, map, norm_factors, count, index) end def update_maps(train_set) - raise ArgumentError, "Missing user_id" if train_set.any? { |v| v[:user_id].nil? } - raise ArgumentError, "Missing item_id" if train_set.any? { |v| v[:item_id].nil? } - train_set.each do |v| @user_map[v[:user_id]] ||= @user_map.size @item_map[v[:item_id]] ||= @item_map.size end + + # much more efficient than checking every value in another pass + raise ArgumentError, "Missing user_id" if @user_map.key?(nil) + raise ArgumentError, "Missing item_id" if @item_map.key?(nil) end def check_ratings(ratings) From f39d76513b84348dcf0cc8f5227990172e3a066f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:07:33 -0800 Subject: [PATCH 066/158] Update maps and build matrix in single pass [skip ci] --- lib/disco/recommender.rb | 22 +++++++--------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index b8b4d86..90bd903 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -27,14 +27,13 @@ def fit(train_set, validation_set: nil) end end - update_maps(train_set) - @rated = Hash.new { |hash, key| hash[key] = {} } input = [] value_key = @implicit ? :value : :rating train_set.each do |v| - u = @user_map[v[:user_id]] - i = @item_map[v[:item_id]] + # update maps and build matrix in single pass + u = (@user_map[v[:user_id]] ||= @user_map.size) + i = (@item_map[v[:item_id]] ||= @item_map.size) @rated[u][i] = true # explicit will always have a value due to check_ratings @@ -42,6 +41,10 @@ def fit(train_set, validation_set: nil) end @rated.default = nil + # much more efficient than checking every value in another pass + raise ArgumentError, "Missing user_id" if @user_map.key?(nil) + raise ArgumentError, "Missing item_id" if @item_map.key?(nil) + if @top_items @item_count = [0] * @item_map.size @item_sum = [0.0] * @item_map.size @@ -318,17 +321,6 @@ def similar(id, map, norm_factors, count, index) end end - def update_maps(train_set) - train_set.each do |v| - @user_map[v[:user_id]] ||= @user_map.size - @item_map[v[:item_id]] ||= @item_map.size - end - - # much more efficient than checking every value in another pass - raise ArgumentError, "Missing user_id" if @user_map.key?(nil) - raise ArgumentError, "Missing item_id" if @item_map.key?(nil) - end - def check_ratings(ratings) unless ratings.all? { |r| !r[:rating].nil? } raise ArgumentError, "Missing ratings" From a378161795d626e352b4efb3d0833c2e2e5c02a2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:12:22 -0800 Subject: [PATCH 067/158] Added tests for invalid ratings [skip ci] --- lib/disco/recommender.rb | 6 ++++-- test/recommender_test.rb | 16 ++++++++++++++++ 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 90bd903..def16e8 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -17,7 +17,9 @@ def fit(train_set, validation_set: nil) check_training_set(train_set) + # TODO option to set in initializer to avoid pass @implicit = !train_set.any? { |v| v[:rating] } + unless @implicit check_ratings(train_set) @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } @@ -323,10 +325,10 @@ def similar(id, map, norm_factors, count, index) def check_ratings(ratings) unless ratings.all? { |r| !r[:rating].nil? } - raise ArgumentError, "Missing ratings" + raise ArgumentError, "Missing rating" end unless ratings.all? { |r| r[:rating].is_a?(Numeric) } - raise ArgumentError, "Ratings must be numeric" + raise ArgumentError, "Rating must be numeric" end end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 7bed26c..6168d59 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -274,6 +274,22 @@ def test_missing_item_id assert_equal "Missing item_id", error.message end + def test_missing_rating + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}, {user_id: 1, item_id: 2}]) + end + assert_equal "Missing rating", error.message + end + + def test_invalid_rating + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: "invalid"}]) + end + assert_equal "Rating must be numeric", error.message + end + def test_multiple_user_item skip # no error for now From f9c44b59ab523c73931394bf118e04e3b1148188 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:13:45 -0800 Subject: [PATCH 068/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index def16e8..2b8df4e 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -18,6 +18,8 @@ def fit(train_set, validation_set: nil) check_training_set(train_set) # TODO option to set in initializer to avoid pass + # could also just check first value + # but may be confusing if first value is missing @implicit = !train_set.any? { |v| v[:rating] } unless @implicit From 17db0a85a79a1fecdc3e954aa36de23e5ba6e223 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:27:22 -0800 Subject: [PATCH 069/158] Added rating tests for validation set [skip ci] --- test/recommender_test.rb | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 6168d59..a6f2fa0 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -282,6 +282,14 @@ def test_missing_rating assert_equal "Missing rating", error.message end + def test_missing_rating_validation_set + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}], validation_set: [{user_id: 1, item_id: 2}]) + end + assert_equal "Missing rating", error.message + end + def test_invalid_rating recommender = Disco::Recommender.new error = assert_raises ArgumentError do @@ -290,6 +298,14 @@ def test_invalid_rating assert_equal "Rating must be numeric", error.message end + def test_invalid_rating_validation_set + recommender = Disco::Recommender.new + error = assert_raises ArgumentError do + recommender.fit([{user_id: 1, item_id: 1, rating: 5}], validation_set: [{user_id: 1, item_id: 1, rating: "invalid"}]) + end + assert_equal "Rating must be numeric", error.message + end + def test_multiple_user_item skip # no error for now From 7257fb308d738e322753b9a6c384fc1fc4b8032d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 22:38:51 -0800 Subject: [PATCH 070/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 2b8df4e..e444be5 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -18,8 +18,8 @@ def fit(train_set, validation_set: nil) check_training_set(train_set) # TODO option to set in initializer to avoid pass - # could also just check first value - # but may be confusing if first value is missing + # could also just check first few values + # but may be confusing if they are all missing and later ones aren't @implicit = !train_set.any? { |v| v[:rating] } unless @implicit From 548804227a668c57eb932f42108ec85a4a80b6be Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 20 Feb 2021 23:21:25 -0800 Subject: [PATCH 071/158] Improved code [skip ci] --- lib/disco/recommender.rb | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index e444be5..2faa967 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -22,9 +22,10 @@ def fit(train_set, validation_set: nil) # but may be confusing if they are all missing and later ones aren't @implicit = !train_set.any? { |v| v[:rating] } + # TODO improve performance + # (catch exception instead of checking ahead of time) unless @implicit check_ratings(train_set) - @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } if validation_set check_ratings(validation_set) @@ -49,6 +50,11 @@ def fit(train_set, validation_set: nil) raise ArgumentError, "Missing user_id" if @user_map.key?(nil) raise ArgumentError, "Missing item_id" if @item_map.key?(nil) + # TODO improve performance + unless @implicit + @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } + end + if @top_items @item_count = [0] * @item_map.size @item_sum = [0.0] * @item_map.size From 3fc049f2466cd793cdfaa4319e9494dd4eaa2976 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 24 Feb 2021 11:48:12 -0800 Subject: [PATCH 072/158] Fixed issue with similar_users and item_recs returning the original user/item - fixes #9 --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 10 ++++++++-- test/recommender_test.rb | 7 +++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8474bdb..0207f47 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Improved performance - Improved `inspect` method +- Fixed issue with `similar_users` and `item_recs` returning the original user/item - Fixed error with `fit` after loading ## 0.2.5 (2021-02-20) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 2faa967..7ac1014 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -323,9 +323,15 @@ def similar(id, map, norm_factors, count, index) # TODO use user_id for similar_users in 0.3.0 key = :item_id - (1...ids.size).map do |i| - {key => keys[ids[i]], score: predictions[i]} + result = [] + # items can have the same score + # so original item may not be at index 0 + ids.each_with_index do |id, j| + next if id == i + + result << {key => keys[id], score: predictions[j]} end + result else [] end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index a6f2fa0..241d293 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -99,6 +99,13 @@ def test_rated assert_equal ["A", "B"], recommender.user_recs(2).map { |r| r[:item_id] }.sort end + def test_item_recs_same_score + data = [{user_id: 952, item_id: 2057}, {user_id: 952, item_id: 2060}, {user_id: 953, item_id: 2063}] + recommender = Disco::Recommender.new(factors: 50) + recommender.fit(data) + assert_equal [2060, 2063], recommender.item_recs(2057).map { |r| r[:item_id] } + end + def test_top_items_explicit data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20, top_items: true) From 45ff023f90ff5d3e07500b6e5434f7503c4f9aa1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 24 Feb 2021 11:53:57 -0800 Subject: [PATCH 073/158] Version bump to 0.2.6 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0207f47..f1911e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.6 (unreleased) +## 0.2.6 (2021-02-24) - Improved performance - Improved `inspect` method diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 65f4839..e928bc8 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.5" + VERSION = "0.2.6" end From 551ca0f156e581411cc40fc7660bcdc431d13876 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 24 Feb 2021 12:04:32 -0800 Subject: [PATCH 074/158] Simplify test [skip ci] --- test/recommender_test.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 241d293..a25e1ac 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -100,10 +100,10 @@ def test_rated end def test_item_recs_same_score - data = [{user_id: 952, item_id: 2057}, {user_id: 952, item_id: 2060}, {user_id: 953, item_id: 2063}] + data = [{user_id: 1, item_id: "A"}, {user_id: 1, item_id: "B"}, {user_id: 2, item_id: "C"}] recommender = Disco::Recommender.new(factors: 50) recommender.fit(data) - assert_equal [2060, 2063], recommender.item_recs(2057).map { |r| r[:item_id] } + assert_equal ["B", "C"], recommender.item_recs("A").map { |r| r[:item_id] } end def test_top_items_explicit From c6bd1ba1c121209217946f990fb2a1ac970479ca Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 25 Feb 2021 17:58:20 -0800 Subject: [PATCH 075/158] Prep to use IndexHNSWFlat in 0.3.0 [skip ci] --- lib/disco/recommender.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 7ac1014..f85ae78 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -255,8 +255,9 @@ def create_index(factors, library:) # inner product is cosine similarity with normalized vectors # https://github.com/facebookresearch/faiss/issues/95 # - # TODO use non-exact index + # TODO use non-exact index in 0.3.0 # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes + # index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product) index = Faiss::IndexFlatIP.new(factors.shape[1]) # ids are from 0...total From 642bcc052644114e8627e4f22858f976f41a14d0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 26 Feb 2021 18:36:45 -0800 Subject: [PATCH 076/158] Updated readme [skip ci] --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9f51b73..4bcc4f7 100644 --- a/README.md +++ b/README.md @@ -282,22 +282,22 @@ gem 'faiss' Speed up the `user_recs` method with: ```ruby -model.optimize_user_recs +recommender.optimize_user_recs ``` Speed up the `item_recs` method with: ```ruby -model.optimize_item_recs +recommender.optimize_item_recs ``` Speed up the `similar_users` method with: ```ruby -model.optimize_similar_users +recommender.optimize_similar_users ``` -This should be called after fitting or loading the model. +This should be called after fitting or loading the recommender. ## Reference From cc1ba7143da020f279c9c1886d5c3021bb37e29e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 28 Feb 2021 14:28:02 -0800 Subject: [PATCH 077/158] Prep to use Numo for top items --- lib/disco/recommender.rb | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index f85ae78..1e9506e 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -179,19 +179,22 @@ def top_items(count: 5) raise "top_items not computed" unless @top_items if @implicit - scores = @item_count + scores = Numo::UInt64.cast(@item_count) else require "wilson_score" + # TODO use Numo instead of wilson_score gem for best performance range = @min_rating..@max_rating - scores = @item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) } + scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }) end - scores = scores.map.with_index.sort_by { |s, _| -s } - scores = scores.first(count) if count - item_ids = item_ids() - scores.map do |s, i| - {item_id: item_ids[i], score: s} + indexes = scores.sort_index.reverse + indexes = indexes[0...[count, indexes.size].min] if count + scores = scores[indexes] + + keys = @item_map.keys + indexes.size.times.map do |i| + {item_id: keys[indexes[i]], score: scores[i]} end end From cbfc70155a23ef70171bf3817afecf4ba8934453 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 28 Feb 2021 15:13:38 -0800 Subject: [PATCH 078/158] Added code to remove wilson_score dependency for top_items [skip ci] --- lib/disco/recommender.rb | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 1e9506e..a3ffc3a 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -183,9 +183,19 @@ def top_items(count: 5) else require "wilson_score" - # TODO use Numo instead of wilson_score gem for best performance range = @min_rating..@max_rating scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }) + + # TODO uncomment in 0.3.0 + # wilson score with continuity correction + # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction + # z = 1.96 # 95% confidence + # range = @max_rating - @min_rating + # n = Numo::DFloat.cast(@item_count) + # phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n + # phat = (phat - (1 / 2 * n)).clip(0, 100) # continuity correction + # scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) + # scores = scores * range + @min_rating end indexes = scores.sort_index.reverse From 73e16d801ed7907d24a4ce8365a178d41db3fd16 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 28 Feb 2021 15:24:06 -0800 Subject: [PATCH 079/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index a3ffc3a..a9faf91 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -138,8 +138,7 @@ def user_recs(user_id, count: 5, item_ids: nil) predictions, ids = @user_recs_index.search(@user_factors[u, true].expand_dims(0), count + rated.size).map { |v| v[0, true] } else predictions = @item_factors.inner(@user_factors[u, true]) - # TODO make sure reverse isn't hurting performance - indexes = predictions.sort_index.reverse + indexes = predictions.sort_index.reverse # reverse just creates view indexes = indexes[0...[count + rated.size, indexes.size].min] if count predictions = predictions[indexes] ids = indexes From d37ccc8c9b9d8b712e2615aa6755853d5e853e07 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 23 Apr 2021 00:20:20 -0700 Subject: [PATCH 080/158] Added link to Neighbor examples [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bcc4f7..2c0faca 100644 --- a/README.md +++ b/README.md @@ -201,7 +201,7 @@ bin = File.binread("recommender.bin") recommender = Marshal.load(bin) ``` -Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor) +Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples). ## Algorithms From b0fa612509a5943f282d72cfbc414a04e6d53b6b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 29 Jul 2021 11:09:18 -0700 Subject: [PATCH 081/158] Use ActiveRecord::Schema.define for test setup [skip ci] --- test/support/active_record.rb | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/test/support/active_record.rb b/test/support/active_record.rb index 0c97c1d..56e09e7 100644 --- a/test/support/active_record.rb +++ b/test/support/active_record.rb @@ -7,20 +7,22 @@ # migrations ActiveRecord::Base.establish_connection adapter: "sqlite3", database: ":memory:" -ActiveRecord::Migration.create_table :users do |t| - t.string :name -end - -ActiveRecord::Migration.create_table :products do |t| - t.string :name -end - -ActiveRecord::Migration.create_table :disco_recommendations do |t| - t.references :subject, polymorphic: true - t.references :item, polymorphic: true - t.float :score - t.string :context - t.timestamps +ActiveRecord::Schema.define do + create_table :users do |t| + t.string :name + end + + create_table :products do |t| + t.string :name + end + + create_table :disco_recommendations do |t| + t.references :subject, polymorphic: true + t.references :item, polymorphic: true + t.float :score + t.string :context + t.timestamps + end end class User < ActiveRecord::Base From 21c7f0f8dd72a0e7133969a86e5d9487c463c95c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 6 Aug 2021 11:11:42 -0700 Subject: [PATCH 082/158] Added warning for value - closes #13 --- CHANGELOG.md | 4 ++++ README.md | 35 +++++++++++++++++++++++++++-------- lib/disco/recommender.rb | 11 +++++++---- test/recommender_test.rb | 12 ++++++++++-- 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1911e1..394cdc7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.7 (unreleased) + +- Added warning for `value` + ## 0.2.6 (2021-02-24) - Improved performance diff --git a/README.md b/README.md index 2c0faca..3b1a51c 100644 --- a/README.md +++ b/README.md @@ -35,17 +35,15 @@ recommender.fit([ > IDs can be integers, strings, or any other data type -If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating, or use a value like number of purchases, number of page views, or time spent on page: +If users don’t rate items directly (for instance, they’re purchasing items or reading posts), this is known as implicit feedback. Leave out the rating. ```ruby recommender.fit([ - {user_id: 1, item_id: 1, value: 1}, - {user_id: 2, item_id: 1, value: 1} + {user_id: 1, item_id: 1}, + {user_id: 2, item_id: 1} ]) ``` -> Use `value` instead of `rating` for implicit feedback - Get user-based recommendations - “users like you also liked” ```ruby @@ -106,11 +104,10 @@ views = Ahoy::Event. count data = - views.map do |(user_id, post_id), count| + views.map do |(user_id, post_id), _| { user_id: user_id, - item_id: post_id, - value: count + item_id: post_id } end ``` @@ -336,6 +333,28 @@ Thanks to: - [Implicit](https://github.com/benfred/implicit/) for serving as an initial reference for user and item similarity - [@dasch](https://github.com/dasch) for the gem name +## Upgrading + +### 0.2.7 + +There’s now a warning when passing `:value` with implicit feedback, as this has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used. + +```ruby +recommender.fit([ + {user_id: 1, item_id: 1, value: 1}, + {user_id: 2, item_id: 1, value: 3} +]) +``` + +to: + +```ruby +recommender.fit([ + {user_id: 1, item_id: 1}, + {user_id: 2, item_id: 1} +]) +``` + ## History View the [changelog](https://github.com/ankane/disco/blob/master/CHANGELOG.md) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index a9faf91..2a4a7dc 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -22,6 +22,10 @@ def fit(train_set, validation_set: nil) # but may be confusing if they are all missing and later ones aren't @implicit = !train_set.any? { |v| v[:rating] } + if train_set.any? { |v| v[:value] } + warn "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used." + end + # TODO improve performance # (catch exception instead of checking ahead of time) unless @implicit @@ -34,7 +38,6 @@ def fit(train_set, validation_set: nil) @rated = Hash.new { |hash, key| hash[key] = {} } input = [] - value_key = @implicit ? :value : :rating train_set.each do |v| # update maps and build matrix in single pass u = (@user_map[v[:user_id]] ||= @user_map.size) @@ -42,7 +45,7 @@ def fit(train_set, validation_set: nil) @rated[u][i] = true # explicit will always have a value due to check_ratings - input << [u, i, v[value_key] || 1] + input << [u, i, @implicit ? 1 : v[:rating]] end @rated.default = nil @@ -61,7 +64,7 @@ def fit(train_set, validation_set: nil) train_set.each do |v| i = @item_map[v[:item_id]] @item_count[i] += 1 - @item_sum[i] += (v[value_key] || 1) + @item_sum[i] += (@implicit ? 1 : v[:rating]) end end @@ -76,7 +79,7 @@ def fit(train_set, validation_set: nil) u ||= -1 i ||= -1 - eval_set << [u, i, v[value_key] || 1] + eval_set << [u, i, @implicit ? 1 : v[:rating]] end end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index a25e1ac..d1586c0 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -75,8 +75,8 @@ def test_examples recommender = Disco::Recommender.new recommender.fit([ - {user_id: 1, item_id: 1, value: 1}, - {user_id: 2, item_id: 1, value: 2} + {user_id: 1, item_id: 1}, + {user_id: 2, item_id: 1} ]) recommender.user_recs(1) recommender.item_recs(1) @@ -313,6 +313,14 @@ def test_invalid_rating_validation_set assert_equal "Rating must be numeric", error.message end + def test_value + recommender = Disco::Recommender.new + _, stderr = capture_io do + recommender.fit([{user_id: 1, item_id: 1, value: 5}]) + end + assert_match "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations", stderr + end + def test_multiple_user_item skip # no error for now From 2ec82bc176b50f33b098cc1ce26bf21e2e6ba439 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 6 Aug 2021 11:15:15 -0700 Subject: [PATCH 083/158] Only check if implicit --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 2a4a7dc..5e0c5d6 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -22,7 +22,7 @@ def fit(train_set, validation_set: nil) # but may be confusing if they are all missing and later ones aren't @implicit = !train_set.any? { |v| v[:rating] } - if train_set.any? { |v| v[:value] } + if @implicit && train_set.any? { |v| v[:value] } warn "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used." end From 02c479653fc0f5114c4bf302a8a13150a67c112b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 6 Aug 2021 11:15:50 -0700 Subject: [PATCH 084/158] Version bump to 0.2.7 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 394cdc7..7017be8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.7 (unreleased) +## 0.2.7 (2021-08-06) - Added warning for `value` diff --git a/lib/disco/version.rb b/lib/disco/version.rb index e928bc8..e8a00c8 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.6" + VERSION = "0.2.7" end From ea93db1c604202eda57c03e6c72caada69d73591 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Aug 2021 20:24:37 -0700 Subject: [PATCH 085/158] Fixed formula [skip ci] --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 5e0c5d6..ba8f493 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -195,7 +195,7 @@ def top_items(count: 5) # range = @max_rating - @min_rating # n = Numo::DFloat.cast(@item_count) # phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n - # phat = (phat - (1 / 2 * n)).clip(0, 100) # continuity correction + # phat = (phat - (1 / (2 * n))).clip(0, 100) # continuity correction # scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) # scores = scores * range + @min_rating end From 676ddedb67319cc4d8206aebbd77b7ae96cf446e Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Aug 2021 20:30:40 -0700 Subject: [PATCH 086/158] No need for max [skip ci] --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index ba8f493..e763f27 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -195,7 +195,7 @@ def top_items(count: 5) # range = @max_rating - @min_rating # n = Numo::DFloat.cast(@item_count) # phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n - # phat = (phat - (1 / (2 * n))).clip(0, 100) # continuity correction + # phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction # scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) # scores = scores * range + @min_rating end From 20deab5410df0e07ed14f022303fa009e39c76b7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 20 Oct 2021 23:27:37 -0700 Subject: [PATCH 087/158] Use group_prop in example [skip ci] --- README.md | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/README.md b/README.md index 3b1a51c..2caa118 100644 --- a/README.md +++ b/README.md @@ -97,11 +97,7 @@ recommender.item_recs("Star Wars (1977)") [Ahoy](https://github.com/ankane/ahoy) is a great source for implicit feedback ```ruby -views = Ahoy::Event. - where(name: "Viewed post"). - group(:user_id). - group("properties->>'post_id'"). # postgres syntax - count +views = Ahoy::Event.where(name: "Viewed post").group(:user_id).group_prop(:post_id).count data = views.map do |(user_id, post_id), _| From 3d772fd1b383f6df742e3c522da3afb305560305 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 6 Dec 2021 19:50:08 -0800 Subject: [PATCH 088/158] Test with Active Record 7 rc1 --- .github/workflows/build.yml | 3 ++- gemfiles/activerecord70.gemfile | 13 +++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 gemfiles/activerecord70.gemfile diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 74ced57..1afa2c2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -2,13 +2,14 @@ name: build on: [push, pull_request] jobs: build: - if: "!contains(github.event.head_commit.message, '[skip ci]')" strategy: fail-fast: false matrix: include: - ruby: 3.0 gemfile: Gemfile + - ruby: 3.0 + gemfile: gemfiles/activerecord70.gemfile - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile - ruby: 2.6 diff --git a/gemfiles/activerecord70.gemfile b/gemfiles/activerecord70.gemfile new file mode 100644 index 0000000..de3ce3b --- /dev/null +++ b/gemfiles/activerecord70.gemfile @@ -0,0 +1,13 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 7.0.0.rc1" +gem "sqlite3" +gem "daru" +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" +gem "wilson_score" From d936911eda1a9ebabe622c580ec7ace3cf7b88f2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 8 Dec 2021 22:44:40 -0800 Subject: [PATCH 089/158] Added note [skip ci] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2caa118..79a4458 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ recommender.fit([ ]) ``` +> Each `user_id`/`item_id` combination should only appear once + Get user-based recommendations - “users like you also liked” ```ruby From 82101e2cae7884591f9dd2fb6f87bd3b42f90bfe Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 15 Dec 2021 13:57:40 -0800 Subject: [PATCH 090/158] Added comment [skip ci] --- lib/disco/model.rb | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/disco/model.rb b/lib/disco/model.rb index 2a29d35..55e4af1 100644 --- a/lib/disco/model.rb +++ b/lib/disco/model.rb @@ -10,6 +10,7 @@ def has_recommended(name, class_name: nil) has_many :"recommended_#{name}", -> { where("disco_recommendations.context = ?", name).order("disco_recommendations.score DESC") }, through: :recommendations, source: :item, source_type: class_name + # TODO use fetch for item_id and score in 0.3.0 define_method("update_recommended_#{name}") do |items| now = Time.now items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item[:item_id], context: name, score: item[:score], created_at: now, updated_at: now} } From a2a510dcf5b2b6a5956cdacc0cf8f1813e73000d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 15 Dec 2021 19:07:07 -0800 Subject: [PATCH 091/158] Test with Active Record 7 by default --- .github/workflows/build.yml | 2 +- Gemfile | 2 +- gemfiles/{activerecord70.gemfile => activerecord61.gemfile} | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename gemfiles/{activerecord70.gemfile => activerecord61.gemfile} (83%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1afa2c2..d5bb628 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,7 +9,7 @@ jobs: - ruby: 3.0 gemfile: Gemfile - ruby: 3.0 - gemfile: gemfiles/activerecord70.gemfile + gemfile: gemfiles/activerecord61.gemfile - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile - ruby: 2.6 diff --git a/Gemfile b/Gemfile index cbfa939..3967faf 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ gemspec gem "rake" gem "minitest", ">= 5" -gem "activerecord" +gem "activerecord", "~> 7.0.0" gem "sqlite3" gem "daru" gem "rover-df" diff --git a/gemfiles/activerecord70.gemfile b/gemfiles/activerecord61.gemfile similarity index 83% rename from gemfiles/activerecord70.gemfile rename to gemfiles/activerecord61.gemfile index de3ce3b..cc2fd6a 100644 --- a/gemfiles/activerecord70.gemfile +++ b/gemfiles/activerecord61.gemfile @@ -4,7 +4,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" -gem "activerecord", "~> 7.0.0.rc1" +gem "activerecord", "~> 6.1.0" gem "sqlite3" gem "daru" gem "rover-df" From b78561fd4c5c056772ffc952575d56b933068c25 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 7 Jan 2022 14:22:09 -0500 Subject: [PATCH 092/158] Test with Ruby 3.1 on CI --- .github/workflows/build.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d5bb628..4a5dcbf 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,9 +6,9 @@ jobs: fail-fast: false matrix: include: - - ruby: 3.0 + - ruby: 3.1 gemfile: Gemfile - - ruby: 3.0 + - ruby: "3.0" gemfile: gemfiles/activerecord61.gemfile - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile From f72634dec20cc1f8a2e40231a0adda9911271cba Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 7 Jan 2022 14:34:30 -0500 Subject: [PATCH 093/158] Fixed CI --- Gemfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Gemfile b/Gemfile index 3967faf..0972477 100644 --- a/Gemfile +++ b/Gemfile @@ -7,6 +7,7 @@ gem "minitest", ">= 5" gem "activerecord", "~> 7.0.0" gem "sqlite3" gem "daru" +gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" From 903c6e386cde44ad69ac19c4f528eb105aab5b73 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 14:25:23 -0800 Subject: [PATCH 094/158] Added test for top items with no range - #20 --- test/recommender_test.rb | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index d1586c0..b542d00 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -141,6 +141,18 @@ def test_top_items_not_computed assert_equal "top_items not computed", error.message end + def test_top_items_no_range + data = [ + {user_id: 1, item_id: "A", rating: 5}, + {user_id: 1, item_id: "B", rating: 5}, + {user_id: 2, item_id: "B", rating: 5} + ] + recommender = Disco::Recommender.new(factors: 20, top_items: true) + recommender.fit(data) + # TODO fix + # recommender.top_items + end + def test_ids data = [ {user_id: 1, item_id: "A"}, From f868b820e18eee37806458824db1dac4d9227dde Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Feb 2022 14:31:40 -0800 Subject: [PATCH 095/158] Fixed error with top_items with all same rating - fixes #20 --- CHANGELOG.md | 4 ++++ lib/disco/recommender.rb | 8 +++++++- test/recommender_test.rb | 3 +-- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 7017be8..aba8e13 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.8 (unreleased) + +- Fixed error with `top_items` with all same rating + ## 0.2.7 (2021-08-06) - Added warning for `value` diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index e763f27..c25f927 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -185,7 +185,13 @@ def top_items(count: 5) else require "wilson_score" - range = @min_rating..@max_rating + range = + if @min_rating == @max_rating + # TODO remove temp fix + (@min_rating - 1)..@max_rating + else + @min_rating..@max_rating + end scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }) # TODO uncomment in 0.3.0 diff --git a/test/recommender_test.rb b/test/recommender_test.rb index b542d00..837b01b 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -149,8 +149,7 @@ def test_top_items_no_range ] recommender = Disco::Recommender.new(factors: 20, top_items: true) recommender.fit(data) - # TODO fix - # recommender.top_items + assert_equal ["B", "A"], recommender.top_items.map { |r| r[:item_id] } end def test_ids From 75708a7cb9056eea83d10b84d020a1e5bb8717be Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 13 Mar 2022 16:25:12 -0700 Subject: [PATCH 096/158] Version bump to 0.2.8 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aba8e13..e52a6f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.8 (unreleased) +## 0.2.8 (2022-03-13) - Fixed error with `top_items` with all same rating diff --git a/lib/disco/version.rb b/lib/disco/version.rb index e8a00c8..3bbfdc2 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.7" + VERSION = "0.2.8" end From 34063fc8aede924cac0bbecdd11da8268c69dcb7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 01:47:04 -0700 Subject: [PATCH 097/158] Fixed error with load_movielens --- CHANGELOG.md | 4 ++++ lib/disco.rb | 5 ----- lib/disco/data.rb | 11 +++++++++-- test/test_helper.rb | 1 - 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e52a6f6..4ea4144 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.2.9 (unreleased) + +- Fixed error with `load_movielens` + ## 0.2.8 (2022-03-13) - Fixed error with `top_items` with all same rating diff --git a/lib/disco.rb b/lib/disco.rb index b8781f8..966c039 100644 --- a/lib/disco.rb +++ b/lib/disco.rb @@ -2,11 +2,6 @@ require "libmf" require "numo/narray" -# stdlib -require "csv" -require "fileutils" -require "net/http" - # modules require "disco/data" require "disco/metrics" diff --git a/lib/disco/data.rb b/lib/disco/data.rb index 451972c..025d11a 100644 --- a/lib/disco/data.rb +++ b/lib/disco/data.rb @@ -1,9 +1,11 @@ module Disco module Data def load_movielens - item_path = download_file("ml-100k/u.item", "http://files.grouplens.org/datasets/movielens/ml-100k/u.item", + require "csv" + + item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item", file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701") - data_path = download_file("ml-100k/u.data", "http://files.grouplens.org/datasets/movielens/ml-100k/u.data", + data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data", file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490") # convert u.item to utf-8 @@ -29,6 +31,11 @@ def load_movielens private def download_file(fname, origin, file_hash:) + require "digest" + require "fileutils" + require "net/http" + require "tmpdir" + # TODO handle this better raise "No HOME" unless ENV["HOME"] dest = "#{ENV["HOME"]}/.disco/#{fname}" diff --git a/test/test_helper.rb b/test/test_helper.rb index 419a47e..1da70ae 100644 --- a/test/test_helper.rb +++ b/test/test_helper.rb @@ -3,7 +3,6 @@ Bundler.require(:default) require "minitest/autorun" require "minitest/pride" -require "csv" require "daru" require "rover" From 5d5ed61f0f3ea8948032ef34b4e285d9f99b26ec Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 01:47:21 -0700 Subject: [PATCH 098/158] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 79a4458..c19150f 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ Add this line to your application’s Gemfile: ```ruby -gem 'disco' +gem "disco" ``` ## Getting Started From 422bbf299ee9c169a73e55ab4c580ffd426fb3b4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 11:48:30 -0700 Subject: [PATCH 099/158] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c19150f..a7168e1 100644 --- a/README.md +++ b/README.md @@ -271,7 +271,7 @@ If you have a large number of users or items, you can use an approximate nearest Add this line to your application’s Gemfile: ```ruby -gem 'faiss' +gem "faiss" ``` Speed up the `user_recs` method with: From 555e4824d6a0337d8ade6a6924f06ac0bbd6572a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:00:14 -0700 Subject: [PATCH 100/158] Version bump to 0.2.9 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ea4144..1218c83 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.2.9 (unreleased) +## 0.2.9 (2022-03-22) - Fixed error with `load_movielens` diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 3bbfdc2..52dd2dd 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.8" + VERSION = "0.2.9" end From 694a92e60d95d015a7db614c1394ad001b09a959 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:09:18 -0700 Subject: [PATCH 101/158] Dropped support for Ruby < 2.6 [skip ci] --- CHANGELOG.md | 4 ++++ disco.gemspec | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1218c83..6aa3ac0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.0 (unreleased) + +- Dropped support for Ruby < 2.6 + ## 0.2.9 (2022-03-22) - Fixed error with `load_movielens` diff --git a/disco.gemspec b/disco.gemspec index cdebe15..fcf8a38 100644 --- a/disco.gemspec +++ b/disco.gemspec @@ -13,7 +13,7 @@ Gem::Specification.new do |spec| spec.files = Dir["*.{md,txt}", "{app,lib}/**/*"] spec.require_path = "lib" - spec.required_ruby_version = ">= 2.4" + spec.required_ruby_version = ">= 2.6" spec.add_dependency "libmf", ">= 0.2.0" spec.add_dependency "numo-narray" From e17ba9cb27b62b65ae2ed8e40bfbc94f7a8b5b19 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:19:52 -0700 Subject: [PATCH 102/158] Removed dependency on wilson_score gem for top_items --- CHANGELOG.md | 1 + Gemfile | 1 - README.md | 2 +- lib/disco/recommender.rb | 27 ++++++++++----------------- 4 files changed, 12 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6aa3ac0..c46c796 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.3.0 (unreleased) +- Removed dependency on `wilson_score` gem for `top_items` - Dropped support for Ruby < 2.6 ## 0.2.9 (2022-03-22) diff --git a/Gemfile b/Gemfile index 0972477..4bc9aca 100644 --- a/Gemfile +++ b/Gemfile @@ -11,4 +11,3 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "wilson_score" diff --git a/README.md b/README.md index a7168e1..30fe20a 100644 --- a/README.md +++ b/README.md @@ -242,7 +242,7 @@ recommender.fit(data) recommender.top_items ``` -This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback (add [wilson_score](https://github.com/instacart/wilson_score) to your application’s Gemfile) and item frequency for implicit feedback. +This uses [Wilson score](https://www.evanmiller.org/how-not-to-sort-by-average-rating.html) for explicit feedback and item frequency for implicit feedback. ## Data diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index c25f927..24c5c42 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -183,27 +183,20 @@ def top_items(count: 5) if @implicit scores = Numo::UInt64.cast(@item_count) else - require "wilson_score" + min_rating = @min_rating - range = - if @min_rating == @max_rating - # TODO remove temp fix - (@min_rating - 1)..@max_rating - else - @min_rating..@max_rating - end - scores = Numo::DFloat.cast(@item_sum.zip(@item_count).map { |s, c| WilsonScore.rating_lower_bound(s / c, c, range) }) + # TODO remove temp fix + min_rating -= 1 if @min_rating == @max_rating - # TODO uncomment in 0.3.0 # wilson score with continuity correction # https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval_with_continuity_correction - # z = 1.96 # 95% confidence - # range = @max_rating - @min_rating - # n = Numo::DFloat.cast(@item_count) - # phat = (Numo::DFloat.cast(@item_sum) - (@min_rating * n)) / range / n - # phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction - # scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) - # scores = scores * range + @min_rating + z = 1.96 # 95% confidence + range = @max_rating - @min_rating + n = Numo::DFloat.cast(@item_count) + phat = (Numo::DFloat.cast(@item_sum) - (min_rating * n)) / range / n + phat = (phat - (1 / (2 * n))).clip(0, nil) # continuity correction + scores = (phat + z**2 / (2 * n) - z * Numo::DFloat::Math.sqrt((phat * (1 - phat) + z**2 / (4 * n)) / n)) / (1 + z**2 / n) + scores = scores * range + min_rating end indexes = scores.sort_index.reverse From a4ad12416e42d9e2ba4dc38bfc5fc71d726502cf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:21:23 -0700 Subject: [PATCH 103/158] Updated comment [skip ci] --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 24c5c42..5718514 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -269,7 +269,7 @@ def create_index(factors, library:) # inner product is cosine similarity with normalized vectors # https://github.com/facebookresearch/faiss/issues/95 # - # TODO use non-exact index in 0.3.0 + # TODO add option for index type # https://github.com/facebookresearch/faiss/wiki/Faiss-indexes # index = Faiss::IndexHNSWFlat.new(factors.shape[1], 32, :inner_product) index = Faiss::IndexFlatIP.new(factors.shape[1]) From 01cbd20340abafc7eed9ea581f30817e978b4067 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:28:38 -0700 Subject: [PATCH 104/158] Added test for similar_users [skip ci] --- test/recommender_test.rb | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 837b01b..99bfdd8 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -106,6 +106,15 @@ def test_item_recs_same_score assert_equal ["B", "C"], recommender.item_recs("A").map { |r| r[:item_id] } end + def test_similar_users + data = Disco.load_movielens + recommender = Disco::Recommender.new(factors: 20, verbose: false) + recommender.fit(data) + + refute_empty recommender.similar_users(data.first[:user_id]) + assert_empty recommender.similar_users("missing") + end + def test_top_items_explicit data = Disco.load_movielens recommender = Disco::Recommender.new(factors: 20, top_items: true) From ba09dcd9e877623d4e2d219e59c0ebc1bfbeecab Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:30:02 -0700 Subject: [PATCH 105/158] Improved test [skip ci] --- test/recommender_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 99bfdd8..60d77ed 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -108,7 +108,7 @@ def test_item_recs_same_score def test_similar_users data = Disco.load_movielens - recommender = Disco::Recommender.new(factors: 20, verbose: false) + recommender = Disco::Recommender.new(factors: 20) recommender.fit(data) refute_empty recommender.similar_users(data.first[:user_id]) From 0d03738f6d0b90a03263a1637c16d377779cbfcd Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:30:58 -0700 Subject: [PATCH 106/158] Changed item_id to user_id for similar_users --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 9 +++------ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c46c796..d625ab4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.3.0 (unreleased) +- Changed `item_id` to `user_id` for `similar_users` - Removed dependency on `wilson_score` gem for `top_items` - Dropped support for Ruby < 2.6 diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 5718514..d759c43 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -167,13 +167,13 @@ def user_recs(user_id, count: 5, item_ids: nil) def similar_items(item_id, count: 5) check_fit - similar(item_id, @item_map, normalized_item_factors, count, @similar_items_index) + similar(item_id, :item_id, @item_map, normalized_item_factors, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, @user_map, normalized_user_factors, count, @similar_users_index) + similar(user_id, :user_id, @user_map, normalized_user_factors, count, @similar_users_index) end def top_items(count: 5) @@ -311,7 +311,7 @@ def normalize(factors) factors / norms.expand_dims(1) end - def similar(id, map, norm_factors, count, index) + def similar(id, key, map, norm_factors, count, index) i = map[id] if i && norm_factors.shape[0] > 1 @@ -335,9 +335,6 @@ def similar(id, map, norm_factors, count, index) keys = map.keys - # TODO use user_id for similar_users in 0.3.0 - key = :item_id - result = [] # items can have the same score # so original item may not be at index 0 From 7d76ffe5226f3de4b90fac4b29ecf83cc4be6cdb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:32:21 -0700 Subject: [PATCH 107/158] Use fetch for item_id and score [skip ci] --- lib/disco/model.rb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/disco/model.rb b/lib/disco/model.rb index 55e4af1..82bb259 100644 --- a/lib/disco/model.rb +++ b/lib/disco/model.rb @@ -10,10 +10,9 @@ def has_recommended(name, class_name: nil) has_many :"recommended_#{name}", -> { where("disco_recommendations.context = ?", name).order("disco_recommendations.score DESC") }, through: :recommendations, source: :item, source_type: class_name - # TODO use fetch for item_id and score in 0.3.0 define_method("update_recommended_#{name}") do |items| now = Time.now - items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item[:item_id], context: name, score: item[:score], created_at: now, updated_at: now} } + items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} } self.class.transaction do recommendations.where(context: name).delete_all From f25709b6c54999439fe153c50432ad0f379703c1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:37:00 -0700 Subject: [PATCH 108/158] Changed warning to an error when value passed to fit --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 2 +- test/recommender_test.rb | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d625ab4..22ec96c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.3.0 (unreleased) - Changed `item_id` to `user_id` for `similar_users` +- Changed warning to an error when `value` passed to `fit` - Removed dependency on `wilson_score` gem for `top_items` - Dropped support for Ruby < 2.6 diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index d759c43..fa9f9b6 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -23,7 +23,7 @@ def fit(train_set, validation_set: nil) @implicit = !train_set.any? { |v| v[:rating] } if @implicit && train_set.any? { |v| v[:value] } - warn "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used." + raise ArgumentError, "Passing `:value` with implicit feedback has no effect on recommendations and should be removed. Earlier versions of the library incorrectly stated this was used." end # TODO improve performance diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 60d77ed..56579b1 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -335,10 +335,10 @@ def test_invalid_rating_validation_set def test_value recommender = Disco::Recommender.new - _, stderr = capture_io do + error = assert_raises(ArgumentError) do recommender.fit([{user_id: 1, item_id: 1, value: 5}]) end - assert_match "[disco] WARNING: Passing `:value` with implicit feedback has no effect on recommendations", stderr + assert_match "Passing `:value` with implicit feedback has no effect on recommendations", error.message end def test_multiple_user_item From f9b53dbee7fd9093d00820e010c4a243a77dc726 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:40:41 -0700 Subject: [PATCH 109/158] Fixed tests --- test/optimize_test.rb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/optimize_test.rb b/test/optimize_test.rb index ea3a545..c4f707d 100644 --- a/test/optimize_test.rb +++ b/test/optimize_test.rb @@ -56,7 +56,7 @@ def test_optimize_similar_users recs = recommender.similar_users(1) - assert_equal original_recs.map { |v| v[:item_id] }, recs.map { |v| v[:item_id] } + assert_equal original_recs.map { |v| v[:user_id] }, recs.map { |v| v[:user_id] } original_recs.zip(recs).each do |exp, act| assert_in_delta exp[:score], act[:score] end @@ -98,11 +98,11 @@ def test_optimize_similar_users_ngt recs = recommender.similar_users(1, count: 10) # won't match exactly due to ANN - matching_ids = original_recs.map { |v| v[:item_id] } & recs.map { |v| v[:item_id] } + matching_ids = original_recs.map { |v| v[:user_id] } & recs.map { |v| v[:user_id] } assert_includes 8..10, matching_ids.size - matching_ids.each do |item_id| - exp = original_recs.find { |v| v[:item_id] == item_id } - act = recs.find { |v| v[:item_id] == item_id } + matching_ids.each do |user_id| + exp = original_recs.find { |v| v[:user_id] == user_id } + act = recs.find { |v| v[:user_id] == user_id } assert_in_delta exp[:score], act[:score] end assert_equal 10, recs.size From b412475e342b3681b6220cc9285c63b018c39a68 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:50:42 -0700 Subject: [PATCH 110/158] Changed to use Faiss over NGT for optimize_item_recs and optimize_similar_users when both are installed --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 22ec96c..17a9d3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Changed `item_id` to `user_id` for `similar_users` - Changed warning to an error when `value` passed to `fit` +- Changed to use Faiss over NGT for `optimize_item_recs` and `optimize_similar_users` when both are installed - Removed dependency on `wilson_score` gem for `top_items` - Dropped support for Ruby < 2.6 diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index fa9f9b6..029bdc5 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -259,8 +259,7 @@ def inspect # factors should already be normalized for similar users/items def create_index(factors, library:) - # TODO make Faiss the default in 0.3.0 - library ||= defined?(Faiss) && !defined?(Ngt) ? "faiss" : "ngt" + library ||= defined?(Ngt) && !defined?(Faiss) ? "ngt" : "faiss" case library when "faiss" From f4aa74585f72c5c03700b3072782b531ab0f9856 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:56:53 -0700 Subject: [PATCH 111/158] Updated license year [skip ci] --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index bc58858..a8e7f50 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2019-2021 Andrew Kane +Copyright (c) 2019-2022 Andrew Kane MIT License From c7cd3fc7b4dced1be7f24437f19c55c9a234cf46 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 22 Mar 2022 12:57:17 -0700 Subject: [PATCH 112/158] Version bump to 0.3.0 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 17a9d3f..5923404 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.0 (unreleased) +## 0.3.0 (2022-03-22) - Changed `item_id` to `user_id` for `similar_users` - Changed warning to an error when `value` passed to `fit` diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 52dd2dd..f996fec 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.2.9" + VERSION = "0.3.0" end From 15fafdd19d95d8273022492628a946de19dad708 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Jul 2022 13:40:56 -0400 Subject: [PATCH 113/158] Added support for JSON serialization --- CHANGELOG.md | 4 +++ lib/disco/recommender.rb | 66 ++++++++++++++++++++++++++++++++++++++++ test/recommender_test.rb | 6 ++++ 3 files changed, 76 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5923404..6b1270e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.1 (unreleased) + +- Added support for JSON serialization + ## 0.3.0 (2022-03-22) - Changed `item_id` to `user_id` for `similar_users` diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 029bdc5..9997fc4 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -255,6 +255,46 @@ def inspect to_s # for now end + def to_json + require "base64" + require "json" + + obj = { + implicit: @implicit, + user_ids: @user_map.keys, + item_ids: @item_map.keys, + rated: @user_map.map { |_, u| (@rated[u] || {}).keys }, + global_mean: @global_mean, + user_factors: Base64.strict_encode64(@user_factors.to_binary), + item_factors: Base64.strict_encode64(@item_factors.to_binary), + factors: @factors, + epochs: @epochs, + verbose: @verbose + } + + unless @implicit + obj[:min_rating] = @min_rating + obj[:max_rating] = @max_rating + end + + if @top_items + obj[:item_count] = @item_count + obj[:item_sum] = @item_sum + end + + JSON.generate(obj) + end + + def self.load_json(json) + require "json" + + obj = JSON.parse(json) + + recommender = new + recommender.send(:json_load, obj) + recommender + end + private # factors should already be normalized for similar users/items @@ -434,5 +474,31 @@ def marshal_load(obj) @item_sum = obj[:item_sum] end end + + def json_load(obj) + require "base64" + + @implicit = obj["implicit"] + @user_map = obj["user_ids"].map.with_index.to_h + @item_map = obj["item_ids"].map.with_index.to_h + @rated = obj["rated"].map.with_index.to_h { |r, i| [i, r.to_h { |v| [v, true] }] } + @global_mean = obj["global_mean"].to_f + @factors = obj["factors"].to_i + @user_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["user_factors"]), [@user_map.size, @factors]) + @item_factors = Numo::SFloat.from_binary(Base64.strict_decode64(obj["item_factors"]), [@item_map.size, @factors]) + @epochs = obj["epochs"].to_i + @verbose = obj["verbose"] + + unless @implicit + @min_rating = obj["min_rating"] + @max_rating = obj["max_rating"] + end + + @top_items = obj.key?("item_count") + if @top_items + @item_count = obj["item_count"] + @item_sum = obj["item_sum"] + end + end end end diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 56579b1..7139101 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -14,6 +14,9 @@ def test_explicit dump = File.binread(path) recommender = Marshal.load(dump) + dump = recommender.to_json + recommender = Disco::Recommender.load_json(dump) + assert_equal [1664, 20], recommender.item_factors.shape assert_equal [943, 20], recommender.user_factors.shape @@ -55,6 +58,9 @@ def test_implicit dump = File.binread(path) recommender = Marshal.load(dump) + dump = recommender.to_json + recommender = Disco::Recommender.load_json(dump) + assert_equal [1664, 20], recommender.item_factors.shape assert_equal [943, 20], recommender.user_factors.shape assert_equal 0, recommender.global_mean From 91c81eaf1e3cfe3dd3cd0c646dc0763a0a7a4829 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Jul 2022 14:11:12 -0400 Subject: [PATCH 114/158] Added note about serialized recommender [skip ci] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 30fe20a..bdf20c0 100644 --- a/README.md +++ b/README.md @@ -189,6 +189,8 @@ File.binwrite("recommender.bin", bin) > You can save it to a file, database, or any other storage system +The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. + Load a recommender ```ruby From e629c890c67bb9fb8e7fe83ef0891b83f3e7a748 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Jul 2022 14:26:48 -0400 Subject: [PATCH 115/158] Version bump to 0.3.1 [skip ci] --- CHANGELOG.md | 2 +- README.md | 10 +++++----- lib/disco/version.rb | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6b1270e..303db84 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.1 (unreleased) +## 0.3.1 (2022-07-10) - Added support for JSON serialization diff --git a/README.md b/README.md index bdf20c0..41217b1 100644 --- a/README.md +++ b/README.md @@ -183,19 +183,19 @@ For Rails < 6, speed up inserts by adding [activerecord-import](https://github.c If you’d prefer to perform recommendations on-the-fly, store the recommender ```ruby -bin = Marshal.dump(recommender) -File.binwrite("recommender.bin", bin) +json = recommender.to_json +File.write("recommender.json", json) ``` -> You can save it to a file, database, or any other storage system +> You can save it to a file, database, or any other storage system. Also, user and item IDs should be integers or strings for this. The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. Load a recommender ```ruby -bin = File.binread("recommender.bin") -recommender = Marshal.load(bin) +json = File.read("recommender.json") +recommender = Disco::Recommender.load_json(json) ``` Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples). diff --git a/lib/disco/version.rb b/lib/disco/version.rb index f996fec..54b4458 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.3.0" + VERSION = "0.3.1" end From 3b432e43c679a36d3879d6a0e7c17597cc7d2786 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Jul 2022 14:29:12 -0400 Subject: [PATCH 116/158] Updated readme [skip ci] --- README.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index 41217b1..ff373d4 100644 --- a/README.md +++ b/README.md @@ -187,9 +187,7 @@ json = recommender.to_json File.write("recommender.json", json) ``` -> You can save it to a file, database, or any other storage system. Also, user and item IDs should be integers or strings for this. - -The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. +The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system. Also, user and item IDs should be integers or strings for this. Load a recommender From 6bba087f688b6ad57599bf7117bc33b11575777c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Jul 2022 14:44:13 -0400 Subject: [PATCH 117/158] Added link to Trove [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ff373d4..a0ec656 100644 --- a/README.md +++ b/README.md @@ -187,7 +187,7 @@ json = recommender.to_json File.write("recommender.json", json) ``` -The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system. Also, user and item IDs should be integers or strings for this. +The serialized recommender includes user activity from the training data (to avoid recommending previously rated items), so be sure to protect it. You can save it to a file, database, or any other storage system, or use a tool like [Trove](https://github.com/ankane/trove). Also, user and item IDs should be integers or strings for this. Load a recommender From 1f6b45bcd73f46186c797a3b8b0b06e2e943744d Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 7 Aug 2022 02:09:05 -0700 Subject: [PATCH 118/158] Improved tests [skip ci] --- test/recommender_test.rb | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 7139101..aa59fb6 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -29,6 +29,7 @@ def test_explicit item_ids = recs.map { |r| r[:item_id] } assert_includes item_ids, "Empire Strikes Back, The (1980)" assert_includes item_ids, "Return of the Jedi (1983)" + refute_includes item_ids, "Star Wars (1977)" assert_in_delta 0.9972, recs.first[:score], 0.01 @@ -65,9 +66,11 @@ def test_implicit assert_equal [943, 20], recommender.user_factors.shape assert_equal 0, recommender.global_mean - recs = recommender.item_recs("Star Wars (1977)", count: 10).map { |r| r[:item_id] } - assert_includes recs, "Empire Strikes Back, The (1980)" - assert_includes recs, "Return of the Jedi (1983)" + recs = recommender.item_recs("Star Wars (1977)", count: 10) + item_ids = recs.map { |r| r[:item_id] } + assert_includes item_ids, "Empire Strikes Back, The (1980)" + assert_includes item_ids, "Return of the Jedi (1983)" + refute_includes item_ids, "Star Wars (1977)" end def test_examples From 2b55ff3b4995d0f306cb47fbadb1ad951570d9ee Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 7 Aug 2022 19:54:13 -0700 Subject: [PATCH 119/158] Fixed issue when fit is called multiple times --- CHANGELOG.md | 4 ++++ lib/disco/recommender.rb | 5 +++++ test/recommender_test.rb | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 303db84..b03a854 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.3.2 (unreleased) + +- Fixed issue when `fit` is called multiple times + ## 0.3.1 (2022-07-10) - Added support for JSON serialization diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 9997fc4..4e0222f 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -36,6 +36,8 @@ def fit(train_set, validation_set: nil) end end + @user_map = {} + @item_map = {} @rated = Hash.new { |hash, key| hash[key] = {} } input = [] train_set.each do |v| @@ -56,6 +58,9 @@ def fit(train_set, validation_set: nil) # TODO improve performance unless @implicit @min_rating, @max_rating = train_set.minmax_by { |o| o[:rating] }.map { |o| o[:rating] } + else + @min_rating = nil + @max_rating = nil end if @top_items diff --git a/test/recommender_test.rb b/test/recommender_test.rb index aa59fb6..3521a86 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -372,6 +372,15 @@ def test_not_fit assert_equal "Not fit", error.message end + def test_fit_multiple + recommender = Disco::Recommender.new + recommender.fit([{user_id: 1, item_id: 1, rating: 5}]) + recommender.fit([{user_id: 2, item_id: 2}]) + assert_equal [2], recommender.user_ids + assert_equal [2], recommender.item_ids + assert_operator recommender.predict([{user_id: 2, item_id: 2}])[0], :<, 1.0 + end + def test_rover movielens = Disco.load_movielens From 95b18da296287f9e16d7a5cd2cfe5b692e9aa4ae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 7 Aug 2022 20:01:37 -0700 Subject: [PATCH 120/158] Updated link [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a0ec656..8c26d77 100644 --- a/README.md +++ b/README.md @@ -223,7 +223,7 @@ recommender.fit(data, validation_set: validation_set) ## Cold Start -Collaborative filtering suffers from the [cold start problem](https://www.yuspify.com/blog/cold-start-problem-recommender-systems/). It’s unable to make good recommendations without data on a user or item, which is problematic for new users and items. +Collaborative filtering suffers from the [cold start problem](https://en.wikipedia.org/wiki/Cold_start_(recommender_systems)). It’s unable to make good recommendations without data on a user or item, which is problematic for new users and items. ```ruby recommender.user_recs(new_user_id) # returns empty array From 508ceb89a87959d9e48fd931f7a888fe5caf2d4a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 26 Sep 2022 22:58:52 -0700 Subject: [PATCH 121/158] Version bump to 0.3.2 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b03a854..47ccdbc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.3.2 (unreleased) +## 0.3.2 (2022-09-26) - Fixed issue when `fit` is called multiple times diff --git a/lib/disco/version.rb b/lib/disco/version.rb index 54b4458..a2633d1 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.3.1" + VERSION = "0.3.2" end From 233111667afecb0415462304e00636063f4e4e75 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 14 Nov 2022 09:19:22 -0800 Subject: [PATCH 122/158] Updated actions --- .github/workflows/build.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4a5dcbf..5514b0b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,16 +14,16 @@ jobs: gemfile: gemfiles/activerecord60.gemfile - ruby: 2.6 gemfile: gemfiles/activerecord52.gemfile - runs-on: ubuntu-20.04 + runs-on: ubuntu-latest env: BUNDLE_GEMFILE: ${{ matrix.gemfile }} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} bundler-cache: true - - uses: actions/cache@v2 + - uses: actions/cache@v3 with: path: ~/.disco key: disco From c7f2e839d5a9c928956ad9ad323f1a91fd46af2c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 26 Dec 2022 10:33:41 -0800 Subject: [PATCH 123/158] Added Ruby 3.2 to CI --- .github/workflows/build.yml | 38 +++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5514b0b..87fca76 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,25 +6,27 @@ jobs: fail-fast: false matrix: include: - - ruby: 3.1 - gemfile: Gemfile - - ruby: "3.0" - gemfile: gemfiles/activerecord61.gemfile - - ruby: 2.7 - gemfile: gemfiles/activerecord60.gemfile - - ruby: 2.6 - gemfile: gemfiles/activerecord52.gemfile + - ruby: 3.2 + gemfile: Gemfile + - ruby: 3.1 + gemfile: Gemfile + - ruby: "3.0" + gemfile: gemfiles/activerecord61.gemfile + - ruby: 2.7 + gemfile: gemfiles/activerecord60.gemfile + - ruby: 2.6 + gemfile: gemfiles/activerecord52.gemfile runs-on: ubuntu-latest env: BUNDLE_GEMFILE: ${{ matrix.gemfile }} steps: - - uses: actions/checkout@v3 - - uses: ruby/setup-ruby@v1 - with: - ruby-version: ${{ matrix.ruby }} - bundler-cache: true - - uses: actions/cache@v3 - with: - path: ~/.disco - key: disco - - run: bundle exec rake test + - uses: actions/checkout@v3 + - uses: ruby/setup-ruby@v1 + with: + ruby-version: ${{ matrix.ruby }} + bundler-cache: true + - uses: actions/cache@v3 + with: + path: ~/.disco + key: disco + - run: bundle exec rake test From ecae29f58603546d4f8b3a52cc6ed9885666b7a2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Jan 2023 11:20:52 -0800 Subject: [PATCH 124/158] Removed unused gemfiles [skip ci] --- test/gemfiles/activerecord50.gemfile | 6 ------ test/gemfiles/activerecord51.gemfile | 5 ----- test/gemfiles/activerecord52.gemfile | 5 ----- 3 files changed, 16 deletions(-) delete mode 100644 test/gemfiles/activerecord50.gemfile delete mode 100644 test/gemfiles/activerecord51.gemfile delete mode 100644 test/gemfiles/activerecord52.gemfile diff --git a/test/gemfiles/activerecord50.gemfile b/test/gemfiles/activerecord50.gemfile deleted file mode 100644 index 4c2ad1d..0000000 --- a/test/gemfiles/activerecord50.gemfile +++ /dev/null @@ -1,6 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem "activerecord", "~> 5.0.0" -gem "sqlite3", "~> 1.3.0" diff --git a/test/gemfiles/activerecord51.gemfile b/test/gemfiles/activerecord51.gemfile deleted file mode 100644 index 11e5fb2..0000000 --- a/test/gemfiles/activerecord51.gemfile +++ /dev/null @@ -1,5 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem "activerecord", "~> 5.1.0" diff --git a/test/gemfiles/activerecord52.gemfile b/test/gemfiles/activerecord52.gemfile deleted file mode 100644 index 5e35e61..0000000 --- a/test/gemfiles/activerecord52.gemfile +++ /dev/null @@ -1,5 +0,0 @@ -source "https://rubygems.org" - -gemspec path: "../../" - -gem "activerecord", "~> 5.2.0" From 033052a1a38dadb7f8735b9a4c40bdf505964b92 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Jan 2023 11:26:45 -0800 Subject: [PATCH 125/158] Fixed issue with has_recommended and inheritance with Active Record < 6.1 - fixes #25 --- CHANGELOG.md | 4 ++++ lib/disco/model.rb | 3 ++- test/model_test.rb | 12 ++++++++++++ test/support/active_record.rb | 4 ++++ 4 files changed, 22 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 47ccdbc..f182535 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.0 (unreleased) + +- Fixed issue with `has_recommended` and inheritance with Active Record < 6.1 + ## 0.3.2 (2022-09-26) - Fixed issue when `fit` is called multiple times diff --git a/lib/disco/model.rb b/lib/disco/model.rb index 82bb259..7e4a8d5 100644 --- a/lib/disco/model.rb +++ b/lib/disco/model.rb @@ -2,6 +2,7 @@ module Disco module Model def has_recommended(name, class_name: nil) class_name ||= name.to_s.singularize.camelize + subject_type = model_name.name class_eval do unless reflect_on_association(:recommendations) @@ -12,7 +13,7 @@ def has_recommended(name, class_name: nil) define_method("update_recommended_#{name}") do |items| now = Time.now - items = items.map { |item| {subject_type: model_name.name, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} } + items = items.map { |item| {subject_type: subject_type, subject_id: id, item_type: class_name, item_id: item.fetch(:item_id), context: name, score: item.fetch(:score), created_at: now, updated_at: now} } self.class.transaction do recommendations.where(context: name).delete_all diff --git a/test/model_test.rb b/test/model_test.rb index 677ebac..2cfe072 100644 --- a/test/model_test.rb +++ b/test/model_test.rb @@ -12,4 +12,16 @@ def test_recommendations assert_equal products, user.recommended_products.to_a assert_equal [], user.recommended_products_v2.to_a end + + def test_inheritance + user = AdminUser.create! + products = Product.create!([{name: "Product A"}, {name: "Product B"}].shuffle) + user.update_recommended_products([ + {item_id: products.first.id, score: 1}, + {item_id: products.last.id, score: 0.5} + ].shuffle) + assert_equal products.size, user.recommendations.count + assert_equal products, user.recommended_products.to_a + assert_equal [], user.recommended_products_v2.to_a + end end diff --git a/test/support/active_record.rb b/test/support/active_record.rb index 56e09e7..d50a348 100644 --- a/test/support/active_record.rb +++ b/test/support/active_record.rb @@ -10,6 +10,7 @@ ActiveRecord::Schema.define do create_table :users do |t| t.string :name + t.string :type end create_table :products do |t| @@ -30,6 +31,9 @@ class User < ActiveRecord::Base has_recommended :products_v2, class_name: "Product" end +class AdminUser < User +end + class Product < ActiveRecord::Base end From 380f4ec2a7161eac0513b283499aa8f4f8c01bb1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Jan 2023 15:03:18 -0800 Subject: [PATCH 126/158] Switched to require_relative --- lib/disco.rb | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lib/disco.rb b/lib/disco.rb index 966c039..f18d7ee 100644 --- a/lib/disco.rb +++ b/lib/disco.rb @@ -3,13 +3,13 @@ require "numo/narray" # modules -require "disco/data" -require "disco/metrics" -require "disco/recommender" -require "disco/version" +require_relative "disco/data" +require_relative "disco/metrics" +require_relative "disco/recommender" +require_relative "disco/version" # integrations -require "disco/engine" if defined?(Rails) +require_relative "disco/engine" if defined?(Rails) module Disco class Error < StandardError; end @@ -19,7 +19,7 @@ class Error < StandardError; end if defined?(ActiveSupport.on_load) ActiveSupport.on_load(:active_record) do - require "disco/model" + require_relative "disco/model" extend Disco::Model end end From e9c5bd3f8cd57cb3a415d3ca5bbff3e9580fc4b6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 11 Jan 2023 15:06:29 -0800 Subject: [PATCH 127/158] Dropped support for Ruby < 2.7 --- .github/workflows/build.yml | 2 +- CHANGELOG.md | 1 + disco.gemspec | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 87fca76..24392ef 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,7 +14,7 @@ jobs: gemfile: gemfiles/activerecord61.gemfile - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile - - ruby: 2.6 + - ruby: 2.7 gemfile: gemfiles/activerecord52.gemfile runs-on: ubuntu-latest env: diff --git a/CHANGELOG.md b/CHANGELOG.md index f182535..fb90c38 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Fixed issue with `has_recommended` and inheritance with Active Record < 6.1 +- Dropped support for Ruby < 2.7 ## 0.3.2 (2022-09-26) diff --git a/disco.gemspec b/disco.gemspec index fcf8a38..81cfee5 100644 --- a/disco.gemspec +++ b/disco.gemspec @@ -13,8 +13,8 @@ Gem::Specification.new do |spec| spec.files = Dir["*.{md,txt}", "{app,lib}/**/*"] spec.require_path = "lib" - spec.required_ruby_version = ">= 2.6" + spec.required_ruby_version = ">= 2.7" - spec.add_dependency "libmf", ">= 0.2.0" + spec.add_dependency "libmf", ">= 0.2" spec.add_dependency "numo-narray" end From a75e1c7c5118e868cded04dadcd7a564732d8911 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 30 Jan 2023 00:46:26 -0800 Subject: [PATCH 128/158] Dropped support for Rails < 6 --- .github/workflows/build.yml | 2 -- CHANGELOG.md | 4 ++-- README.md | 2 -- gemfiles/activerecord52.gemfile | 13 ------------- lib/disco/model.rb | 14 +++++--------- 5 files changed, 7 insertions(+), 28 deletions(-) delete mode 100644 gemfiles/activerecord52.gemfile diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 24392ef..35e15bb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -14,8 +14,6 @@ jobs: gemfile: gemfiles/activerecord61.gemfile - ruby: 2.7 gemfile: gemfiles/activerecord60.gemfile - - ruby: 2.7 - gemfile: gemfiles/activerecord52.gemfile runs-on: ubuntu-latest env: BUNDLE_GEMFILE: ${{ matrix.gemfile }} diff --git a/CHANGELOG.md b/CHANGELOG.md index fb90c38..5d4c7de 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ ## 0.4.0 (unreleased) -- Fixed issue with `has_recommended` and inheritance with Active Record < 6.1 -- Dropped support for Ruby < 2.7 +- Fixed issue with `has_recommended` and inheritance with Rails < 6.1 +- Dropped support for Ruby < 2.7 and Rails < 6 ## 0.3.2 (2022-09-26) diff --git a/README.md b/README.md index 8c26d77..4ce0a2f 100644 --- a/README.md +++ b/README.md @@ -176,8 +176,6 @@ user.update_recommended_products_v2(recs) user.recommended_products_v2 ``` -For Rails < 6, speed up inserts by adding [activerecord-import](https://github.com/zdennis/activerecord-import) to your app. - ## Storing Recommenders If you’d prefer to perform recommendations on-the-fly, store the recommender diff --git a/gemfiles/activerecord52.gemfile b/gemfiles/activerecord52.gemfile deleted file mode 100644 index 07f0685..0000000 --- a/gemfiles/activerecord52.gemfile +++ /dev/null @@ -1,13 +0,0 @@ -source "https://rubygems.org" - -gemspec path: ".." - -gem "rake" -gem "minitest", ">= 5" -gem "activerecord", "~> 5.2.0" -gem "sqlite3" -gem "daru" -gem "rover-df" -gem "ngt", ">= 0.3.0" -gem "faiss" -gem "wilson_score" diff --git a/lib/disco/model.rb b/lib/disco/model.rb index 7e4a8d5..a9e7e8a 100644 --- a/lib/disco/model.rb +++ b/lib/disco/model.rb @@ -1,6 +1,10 @@ module Disco module Model def has_recommended(name, class_name: nil) + if ActiveRecord::VERSION::MAJOR < 6 + raise Disco::Error, "Requires Active Record 6+" + end + class_name ||= name.to_s.singularize.camelize subject_type = model_name.name @@ -19,15 +23,7 @@ def has_recommended(name, class_name: nil) recommendations.where(context: name).delete_all if items.any? - if recommendations.respond_to?(:insert_all!) - # Rails 6 - recommendations.insert_all!(items) - elsif recommendations.respond_to?(:bulk_import!) - # activerecord-import - recommendations.bulk_import!(items, validate: false) - else - recommendations.create!([items]) - end + recommendations.insert_all!(items) end end end From df5eb2b8e5b5451204aa2b516dfd89436c5e3682 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 30 Jan 2023 01:00:20 -0800 Subject: [PATCH 129/158] Deprecated marshal serialization --- CHANGELOG.md | 1 + lib/disco/recommender.rb | 2 ++ test/recommender_test.rb | 36 ++++++++++++++++++++++++++++++------ 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d4c7de..a6c4a71 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ ## 0.4.0 (unreleased) - Fixed issue with `has_recommended` and inheritance with Rails < 6.1 +- Deprecated marshal serialization - Dropped support for Ruby < 2.7 and Rails < 6 ## 0.3.2 (2022-09-26) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 4e0222f..bd66f69 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -430,6 +430,8 @@ def to_dataset(dataset) end def marshal_dump + warn "[disco] Marshal serialization is deprecated - use JSON instead" + obj = { implicit: @implicit, user_map: @user_map, diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 3521a86..44b1c21 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -8,8 +8,10 @@ def test_explicit path = "#{Dir.mktmpdir}/recommender.bin" - dump = Marshal.dump(recommender) - File.binwrite(path, dump) + assert_deprecated do + dump = Marshal.dump(recommender) + File.binwrite(path, dump) + end dump = File.binread(path) recommender = Marshal.load(dump) @@ -53,8 +55,10 @@ def test_implicit path = "#{Dir.mktmpdir}/recommender.bin" - dump = Marshal.dump(recommender) - File.binwrite(path, dump) + assert_deprecated do + dump = Marshal.dump(recommender) + File.binwrite(path, dump) + end dump = File.binread(path) recommender = Marshal.load(dump) @@ -131,7 +135,13 @@ def test_top_items_explicit top_items = recommender.top_items assert_equal top_items, recommender.user_recs("unknown") - recommender = Marshal.load(Marshal.dump(recommender)) + assert_deprecated do + recommender = Marshal.load(Marshal.dump(recommender)) + end + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Disco::Recommender.load_json(recommender.to_json) assert_equal top_items, recommender.top_items assert_equal top_items, recommender.user_recs("unknown") end @@ -144,7 +154,13 @@ def test_top_items_implicit top_items = recommender.top_items assert_equal top_items, recommender.user_recs("unknown") - recommender = Marshal.load(Marshal.dump(recommender)) + assert_deprecated do + recommender = Marshal.load(Marshal.dump(recommender)) + end + assert_equal top_items, recommender.top_items + assert_equal top_items, recommender.user_recs("unknown") + + recommender = Disco::Recommender.load_json(recommender.to_json) assert_equal top_items, recommender.top_items assert_equal top_items, recommender.user_recs("unknown") end @@ -414,4 +430,12 @@ def test_daru # original data frame not modified assert_equal ["user_id", "item_id", "rating"], data.vectors.to_a end + + private + + def assert_deprecated + assert_output nil, /is deprecated/ do + yield + end + end end From e05b8beaa48825586946cac3ab4349ab0f6754c9 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 30 Jan 2023 01:07:02 -0800 Subject: [PATCH 130/158] Added deprecation warning to load --- lib/disco/recommender.rb | 2 ++ test/recommender_test.rb | 12 ++++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index bd66f69..89934e4 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -459,6 +459,8 @@ def marshal_dump end def marshal_load(obj) + warn "[disco] Marshal serialization is deprecated - use JSON instead" + @implicit = obj[:implicit] @user_map = obj[:user_map] @item_map = obj[:item_map] diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 44b1c21..214b531 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -13,8 +13,10 @@ def test_explicit File.binwrite(path, dump) end - dump = File.binread(path) - recommender = Marshal.load(dump) + assert_deprecated do + dump = File.binread(path) + recommender = Marshal.load(dump) + end dump = recommender.to_json recommender = Disco::Recommender.load_json(dump) @@ -60,8 +62,10 @@ def test_implicit File.binwrite(path, dump) end - dump = File.binread(path) - recommender = Marshal.load(dump) + assert_deprecated do + dump = File.binread(path) + recommender = Marshal.load(dump) + end dump = recommender.to_json recommender = Disco::Recommender.load_json(dump) From 51d6d90d95281587502207ef2f989321cba9dd2b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 30 Jan 2023 01:08:15 -0800 Subject: [PATCH 131/158] Version bump to 0.4.0 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a6c4a71..3eb79a3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.0 (unreleased) +## 0.4.0 (2023-01-30) - Fixed issue with `has_recommended` and inheritance with Rails < 6.1 - Deprecated marshal serialization diff --git a/lib/disco/version.rb b/lib/disco/version.rb index a2633d1..df1c6a9 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.3.2" + VERSION = "0.4.0" end From 35d774006bb8b1d4bada4f05161057a5c80d808f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 25 Jul 2023 17:27:42 -0700 Subject: [PATCH 132/158] Added Active Record 7.1 to CI --- .github/workflows/build.yml | 2 +- gemfiles/activerecord71.gemfile | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 gemfiles/activerecord71.gemfile diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 35e15bb..36baca6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: matrix: include: - ruby: 3.2 - gemfile: Gemfile + gemfile: gemfiles/activerecord71.gemfile - ruby: 3.1 gemfile: Gemfile - ruby: "3.0" diff --git a/gemfiles/activerecord71.gemfile b/gemfiles/activerecord71.gemfile new file mode 100644 index 0000000..f413465 --- /dev/null +++ b/gemfiles/activerecord71.gemfile @@ -0,0 +1,14 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", github: "rails/rails" +gem "sqlite3" +gem "daru" +gem "matrix" # for daru +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" +gem "wilson_score" From ad5c272a22a0c9ed0cb698e325ae755e9a63decc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Sep 2023 11:10:09 -0700 Subject: [PATCH 133/158] Fixed test --- test/recommender_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 214b531..5043e61 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -263,7 +263,7 @@ def test_user_recs_new_item {user_id: 1, item_id: 1, rating: 5}, {user_id: 2, item_id: 1, rating: 3} ]) - assert_empty [], recommender.user_recs(1, item_ids: [1000]) + assert_empty recommender.user_recs(1, item_ids: [1000]) end def test_predict From 3b7318de508dd33bb84f5a937f1d209b5d239669 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 10 Sep 2023 11:10:55 -0700 Subject: [PATCH 134/158] Updated readme [skip ci] --- README.md | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/README.md b/README.md index 4ce0a2f..4473bc8 100644 --- a/README.md +++ b/README.md @@ -329,28 +329,6 @@ Thanks to: - [Implicit](https://github.com/benfred/implicit/) for serving as an initial reference for user and item similarity - [@dasch](https://github.com/dasch) for the gem name -## Upgrading - -### 0.2.7 - -There’s now a warning when passing `:value` with implicit feedback, as this has no effect on recommendations and can be removed. Earlier versions of the library incorrectly stated this was used. - -```ruby -recommender.fit([ - {user_id: 1, item_id: 1, value: 1}, - {user_id: 2, item_id: 1, value: 3} -]) -``` - -to: - -```ruby -recommender.fit([ - {user_id: 1, item_id: 1}, - {user_id: 2, item_id: 1} -]) -``` - ## History View the [changelog](https://github.com/ankane/disco/blob/master/CHANGELOG.md) From 993018d55294a474b72db886dccc84c9f3a53155 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 15 Sep 2023 10:37:06 -0700 Subject: [PATCH 135/158] Test with Rails 7.1.0.beta1 --- gemfiles/activerecord71.gemfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gemfiles/activerecord71.gemfile b/gemfiles/activerecord71.gemfile index f413465..279241e 100644 --- a/gemfiles/activerecord71.gemfile +++ b/gemfiles/activerecord71.gemfile @@ -4,7 +4,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" -gem "activerecord", github: "rails/rails" +gem "activerecord", "7.1.0.beta1" gem "sqlite3" gem "daru" gem "matrix" # for daru From f91b3f56a43d4570d7292468d7e87eec07276d61 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Sep 2023 01:40:25 -0700 Subject: [PATCH 136/158] Updated readme [skip ci] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 4473bc8..dbe9ddd 100644 --- a/README.md +++ b/README.md @@ -229,8 +229,8 @@ recommender.user_recs(new_user_id) # returns empty array There are a number of ways to deal with this, but here are some common ones: -- For user-based recommendations, show new users the most popular items. -- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity). +- For user-based recommendations, show new users the most popular items +- For item-based recommendations, make content-based recommendations with a gem like [tf-idf-similarity](https://github.com/jpmckinney/tf-idf-similarity) Get top items with: From af04f3c0d5f6bda7ff2876eac16bfc10fc34add0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 29 Sep 2023 21:15:55 -0700 Subject: [PATCH 137/158] Fixed encoding for MovieLens data --- lib/disco/data.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/data.rb b/lib/disco/data.rb index 025d11a..640053a 100644 --- a/lib/disco/data.rb +++ b/lib/disco/data.rb @@ -9,7 +9,7 @@ def load_movielens file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490") # convert u.item to utf-8 - movies_str = File.read(item_path).encode("UTF-8", "binary", invalid: :replace, undef: :replace, replace: "") + movies_str = File.read(item_path).encode("UTF-8", "ISO-8859-1") movies = {} CSV.parse(movies_str, col_sep: "|") do |row| From a25c95519a9e89fb01c218fa2d938d48f6ccc218 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 1 Oct 2023 13:02:24 -0700 Subject: [PATCH 138/158] Fixed count for all same scores --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 89934e4..3732f0e 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -387,7 +387,7 @@ def similar(id, key, map, norm_factors, count, index) result << {key => keys[id], score: predictions[j]} end - result + count ? result.first(count) : result else [] end From 40855e270f8419a6bce686727fffe652555e8712 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 1 Oct 2023 13:03:53 -0700 Subject: [PATCH 139/158] Made code consistent with user_recs --- lib/disco/recommender.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 3732f0e..8d68269 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -386,8 +386,9 @@ def similar(id, key, map, norm_factors, count, index) next if id == i result << {key => keys[id], score: predictions[j]} + break if result.size == count end - count ? result.first(count) : result + result else [] end From 06be249f01025939fc57acbd61b90037b7e36203 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 5 Oct 2023 19:50:35 -0700 Subject: [PATCH 140/158] Test with Rails 7.1 --- .github/workflows/build.yml | 4 ++-- Gemfile | 2 +- gemfiles/{activerecord71.gemfile => activerecord70.gemfile} | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) rename gemfiles/{activerecord71.gemfile => activerecord70.gemfile} (85%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 36baca6..20ab04a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,9 +7,9 @@ jobs: matrix: include: - ruby: 3.2 - gemfile: gemfiles/activerecord71.gemfile - - ruby: 3.1 gemfile: Gemfile + - ruby: 3.1 + gemfile: gemfiles/activerecord70.gemfile - ruby: "3.0" gemfile: gemfiles/activerecord61.gemfile - ruby: 2.7 diff --git a/Gemfile b/Gemfile index 4bc9aca..a3e4f27 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ gemspec gem "rake" gem "minitest", ">= 5" -gem "activerecord", "~> 7.0.0" +gem "activerecord", "~> 7.1.0" gem "sqlite3" gem "daru" gem "matrix" # for daru diff --git a/gemfiles/activerecord71.gemfile b/gemfiles/activerecord70.gemfile similarity index 85% rename from gemfiles/activerecord71.gemfile rename to gemfiles/activerecord70.gemfile index 279241e..7fa56a2 100644 --- a/gemfiles/activerecord71.gemfile +++ b/gemfiles/activerecord70.gemfile @@ -4,7 +4,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" -gem "activerecord", "7.1.0.beta1" +gem "activerecord", "~> 7.0.0" gem "sqlite3" gem "daru" gem "matrix" # for daru From 6d785f8cd62148e15de52561114743e462a047ba Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 26 Dec 2023 11:04:40 -0500 Subject: [PATCH 141/158] Test with Ruby 3.3 on CI --- .github/workflows/build.yml | 4 +++- Gemfile | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 20ab04a..46af15c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,6 +6,8 @@ jobs: fail-fast: false matrix: include: + - ruby: 3.3 + gemfile: Gemfile - ruby: 3.2 gemfile: Gemfile - ruby: 3.1 @@ -18,7 +20,7 @@ jobs: env: BUNDLE_GEMFILE: ${{ matrix.gemfile }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} diff --git a/Gemfile b/Gemfile index a3e4f27..c1e2fc4 100644 --- a/Gemfile +++ b/Gemfile @@ -11,3 +11,4 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" +gem "csv" From 7c5555c00bc0d7430379e9057786edfded607244 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 17 Feb 2024 10:19:59 -0800 Subject: [PATCH 142/158] Updated badge [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dbe9ddd..75d2da0 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ - Works with explicit and implicit feedback - Uses high-performance matrix factorization -[![Build Status](https://github.com/ankane/disco/workflows/build/badge.svg?branch=master)](https://github.com/ankane/disco/actions) +[![Build Status](https://github.com/ankane/disco/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/disco/actions) ## Installation From bce703cedcfa3bf02d22018cc6b06471a1906115 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 May 2024 20:42:37 -0400 Subject: [PATCH 143/158] Fixed CI --- Gemfile | 2 +- gemfiles/activerecord60.gemfile | 2 +- gemfiles/activerecord61.gemfile | 2 +- gemfiles/activerecord70.gemfile | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Gemfile b/Gemfile index c1e2fc4..b88c365 100644 --- a/Gemfile +++ b/Gemfile @@ -5,7 +5,7 @@ gemspec gem "rake" gem "minitest", ">= 5" gem "activerecord", "~> 7.1.0" -gem "sqlite3" +gem "sqlite3", "< 2" gem "daru" gem "matrix" # for daru gem "rover-df" diff --git a/gemfiles/activerecord60.gemfile b/gemfiles/activerecord60.gemfile index ef9bee6..67108e7 100644 --- a/gemfiles/activerecord60.gemfile +++ b/gemfiles/activerecord60.gemfile @@ -5,7 +5,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" gem "activerecord", "~> 6.0.0" -gem "sqlite3" +gem "sqlite3", "< 2" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" diff --git a/gemfiles/activerecord61.gemfile b/gemfiles/activerecord61.gemfile index cc2fd6a..d811d55 100644 --- a/gemfiles/activerecord61.gemfile +++ b/gemfiles/activerecord61.gemfile @@ -5,7 +5,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" gem "activerecord", "~> 6.1.0" -gem "sqlite3" +gem "sqlite3", "< 2" gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" diff --git a/gemfiles/activerecord70.gemfile b/gemfiles/activerecord70.gemfile index 7fa56a2..c1d41ab 100644 --- a/gemfiles/activerecord70.gemfile +++ b/gemfiles/activerecord70.gemfile @@ -5,7 +5,7 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" gem "activerecord", "~> 7.0.0" -gem "sqlite3" +gem "sqlite3", "< 2" gem "daru" gem "matrix" # for daru gem "rover-df" From 7c8a397d1233f2998e62a8c44818caef8dc10130 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 22 May 2024 21:06:32 -0400 Subject: [PATCH 144/158] Reduced memory for item_recs and similar_users --- CHANGELOG.md | 4 ++++ lib/disco/recommender.rb | 35 ++++++++++++++++++----------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3eb79a3..673e358 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.1 (unreleased) + +- Reduced memory for `item_recs` and `similar_users` + ## 0.4.0 (2023-01-30) - Fixed issue with `has_recommended` and inheritance with Rails < 6.1 diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 8d68269..7589b1c 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -99,8 +99,8 @@ def fit(train_set, validation_set: nil) @user_factors = model.p_factors(format: :numo) @item_factors = model.q_factors(format: :numo) - @normalized_user_factors = nil - @normalized_item_factors = nil + @user_norms = nil + @item_norms = nil @user_recs_index = nil @similar_users_index = nil @@ -172,13 +172,13 @@ def user_recs(user_id, count: 5, item_ids: nil) def similar_items(item_id, count: 5) check_fit - similar(item_id, :item_id, @item_map, normalized_item_factors, count, @similar_items_index) + similar(item_id, :item_id, @item_map, @item_factors, item_norms, count, @similar_items_index) end alias_method :item_recs, :similar_items def similar_users(user_id, count: 5) check_fit - similar(user_id, :user_id, @user_map, normalized_user_factors, count, @similar_users_index) + similar(user_id, :user_id, @user_map, @user_factors, user_norms, count, @similar_users_index) end def top_items(count: 5) @@ -247,13 +247,13 @@ def optimize_user_recs def optimize_similar_items(library: nil) check_fit - @similar_items_index = create_index(normalized_item_factors, library: library) + @similar_items_index = create_index(@item_factors / item_norms.expand_dims(1), library: library) end alias_method :optimize_item_recs, :optimize_similar_items def optimize_similar_users(library: nil) check_fit - @similar_users_index = create_index(normalized_user_factors, library: library) + @similar_users_index = create_index(@user_factors / user_norms.expand_dims(1), library: library) end def inspect @@ -341,36 +341,37 @@ def create_index(factors, library:) end end - def normalized_user_factors - @normalized_user_factors ||= normalize(@user_factors) + def user_norms + @user_norms ||= norms(@user_factors) end - def normalized_item_factors - @normalized_item_factors ||= normalize(@item_factors) + def item_norms + @item_norms ||= norms(@item_factors) end - def normalize(factors) + def norms(factors) norms = Numo::SFloat::Math.sqrt((factors * factors).sum(axis: 1)) norms[norms.eq(0)] = 1e-10 # no zeros - factors / norms.expand_dims(1) + norms end - def similar(id, key, map, norm_factors, count, index) + def similar(id, key, map, factors, norms, count, index) i = map[id] - if i && norm_factors.shape[0] > 1 + if i && factors.shape[0] > 1 if index && count + norm_factors = factors[i, true] / norms[i] if defined?(Faiss) && index.is_a?(Faiss::Index) - predictions, ids = index.search(norm_factors[i, true].expand_dims(0), count + 1).map { |v| v.to_a[0] } + predictions, ids = index.search(norm_factors.expand_dims(0), count + 1).map { |v| v.to_a[0] } else - result = index.search(norm_factors[i, true], size: count + 1) + result = index.search(norm_factors, size: count + 1) # ids from batch_insert start at 1 instead of 0 ids = result.map { |v| v[:id] - 1 } # convert cosine distance to cosine similarity predictions = result.map { |v| 1 - v[:distance] } end else - predictions = norm_factors.inner(norm_factors[i, true]) + predictions = factors.inner(factors[i, true]) / (norms[i] * norms) indexes = predictions.sort_index.reverse indexes = indexes[0...[count + 1, indexes.size].min] if count predictions = predictions[indexes] From ea3a19f3f4ec5c09f41fff705ad7cfc4473faf2c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 10:03:29 -0400 Subject: [PATCH 145/158] Updated order [skip ci] --- lib/disco/recommender.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 7589b1c..72e6f73 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -371,7 +371,7 @@ def similar(id, key, map, factors, norms, count, index) predictions = result.map { |v| 1 - v[:distance] } end else - predictions = factors.inner(factors[i, true]) / (norms[i] * norms) + predictions = factors.inner(factors[i, true]) / (norms * norms[i]) indexes = predictions.sort_index.reverse indexes = indexes[0...[count + 1, indexes.size].min] if count predictions = predictions[indexes] From 6a1a4780e44f99e5e66c65ae393b914e83421e99 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 23 May 2024 10:04:08 -0400 Subject: [PATCH 146/158] Version bump to 0.4.1 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 673e358..2be2f5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.1 (unreleased) +## 0.4.1 (2024-05-23) - Reduced memory for `item_recs` and `similar_users` diff --git a/lib/disco/version.rb b/lib/disco/version.rb index df1c6a9..fb24cb8 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.4.0" + VERSION = "0.4.1" end From 77ff5bc13e41a01866132cf3867c04e09270aae7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 19:56:17 -0700 Subject: [PATCH 147/158] Test with Active Record 7.2.0.beta2 on CI --- .github/workflows/build.yml | 2 +- gemfiles/activerecord72.gemfile | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 gemfiles/activerecord72.gemfile diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 46af15c..1335c5e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,7 +7,7 @@ jobs: matrix: include: - ruby: 3.3 - gemfile: Gemfile + gemfile: gemfiles/activerecord72.gemfile - ruby: 3.2 gemfile: Gemfile - ruby: 3.1 diff --git a/gemfiles/activerecord72.gemfile b/gemfiles/activerecord72.gemfile new file mode 100644 index 0000000..223b6b8 --- /dev/null +++ b/gemfiles/activerecord72.gemfile @@ -0,0 +1,14 @@ +source "https://rubygems.org" + +gemspec path: ".." + +gem "rake" +gem "minitest", ">= 5" +gem "activerecord", "~> 7.2.0.beta2" +gem "sqlite3" +gem "daru" +gem "matrix" # for daru +gem "rover-df" +gem "ngt", ">= 0.3.0" +gem "faiss" +gem "wilson_score" From 5bf3a5860637ba9da04162c6b011fd13ca169e57 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 19:57:39 -0700 Subject: [PATCH 148/158] Updated CI --- gemfiles/activerecord60.gemfile | 1 - gemfiles/activerecord61.gemfile | 1 - gemfiles/activerecord70.gemfile | 1 - gemfiles/activerecord72.gemfile | 2 +- 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/gemfiles/activerecord60.gemfile b/gemfiles/activerecord60.gemfile index 67108e7..23cad1b 100644 --- a/gemfiles/activerecord60.gemfile +++ b/gemfiles/activerecord60.gemfile @@ -10,4 +10,3 @@ gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "wilson_score" diff --git a/gemfiles/activerecord61.gemfile b/gemfiles/activerecord61.gemfile index d811d55..979b068 100644 --- a/gemfiles/activerecord61.gemfile +++ b/gemfiles/activerecord61.gemfile @@ -10,4 +10,3 @@ gem "daru" gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "wilson_score" diff --git a/gemfiles/activerecord70.gemfile b/gemfiles/activerecord70.gemfile index c1d41ab..1ff6fe4 100644 --- a/gemfiles/activerecord70.gemfile +++ b/gemfiles/activerecord70.gemfile @@ -11,4 +11,3 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "wilson_score" diff --git a/gemfiles/activerecord72.gemfile b/gemfiles/activerecord72.gemfile index 223b6b8..62d00de 100644 --- a/gemfiles/activerecord72.gemfile +++ b/gemfiles/activerecord72.gemfile @@ -11,4 +11,4 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "wilson_score" +gem "csv" From cb9ba53e1c88c8dd2280dc7c859ac146c654d5ca Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 20:07:55 -0700 Subject: [PATCH 149/158] Removed dependency on csv gem for load_movielens --- CHANGELOG.md | 4 ++++ Gemfile | 1 - gemfiles/activerecord72.gemfile | 1 - lib/disco/data.rb | 12 +++++------- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2be2f5b..c29cbcd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.2 (unreleased) + +- Removed dependency on `csv` gem for `load_movielens` + ## 0.4.1 (2024-05-23) - Reduced memory for `item_recs` and `similar_users` diff --git a/Gemfile b/Gemfile index b88c365..394375e 100644 --- a/Gemfile +++ b/Gemfile @@ -11,4 +11,3 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "csv" diff --git a/gemfiles/activerecord72.gemfile b/gemfiles/activerecord72.gemfile index 62d00de..ba6b453 100644 --- a/gemfiles/activerecord72.gemfile +++ b/gemfiles/activerecord72.gemfile @@ -11,4 +11,3 @@ gem "matrix" # for daru gem "rover-df" gem "ngt", ">= 0.3.0" gem "faiss" -gem "csv" diff --git a/lib/disco/data.rb b/lib/disco/data.rb index 640053a..4c299e6 100644 --- a/lib/disco/data.rb +++ b/lib/disco/data.rb @@ -1,23 +1,21 @@ module Disco module Data def load_movielens - require "csv" - item_path = download_file("ml-100k/u.item", "https://files.grouplens.org/datasets/movielens/ml-100k/u.item", file_hash: "553841ebc7de3a0fd0d6b62a204ea30c1e651aacfb2814c7a6584ac52f2c5701") data_path = download_file("ml-100k/u.data", "https://files.grouplens.org/datasets/movielens/ml-100k/u.data", file_hash: "06416e597f82b7342361e41163890c81036900f418ad91315590814211dca490") - # convert u.item to utf-8 - movies_str = File.read(item_path).encode("UTF-8", "ISO-8859-1") - movies = {} - CSV.parse(movies_str, col_sep: "|") do |row| + File.foreach(item_path) do |line| + # convert u.item to utf-8 + row = line.encode("UTF-8", "ISO-8859-1").split("|") movies[row[0]] = row[1] end data = [] - CSV.foreach(data_path, col_sep: "\t") do |row| + File.foreach(data_path) do |line| + row = line.split("\t") data << { user_id: row[0].to_i, item_id: movies[row[1]], From ea6618802f1767082fad4297fd8a58d69abac481 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 9 Jun 2024 20:08:28 -0700 Subject: [PATCH 150/158] Removed comment [skip ci] --- lib/disco/data.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/disco/data.rb b/lib/disco/data.rb index 4c299e6..b9e6524 100644 --- a/lib/disco/data.rb +++ b/lib/disco/data.rb @@ -8,7 +8,6 @@ def load_movielens movies = {} File.foreach(item_path) do |line| - # convert u.item to utf-8 row = line.encode("UTF-8", "ISO-8859-1").split("|") movies[row[0]] = row[1] end From ebf4336a0a840bf16160b0dec2dbc15528b2b841 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 17 Jun 2024 23:43:55 -0700 Subject: [PATCH 151/158] Improved style [skip ci] --- test/recommender_test.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index 5043e61..ad077a4 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -375,7 +375,7 @@ def test_multiple_user_item train_set = [ {user_id: 1, item_id: 2, rating: 1}, - {user_id: 1, item_id: 2, rating: 2}, + {user_id: 1, item_id: 2, rating: 2} ] recommender = Disco::Recommender.new error = assert_raises ArgumentError do From 869f1418cada4fb73c84de206de39b015cc61cde Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Jun 2024 11:24:55 -0700 Subject: [PATCH 152/158] Updated license year [skip ci] --- LICENSE.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.txt b/LICENSE.txt index a8e7f50..55abd58 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,4 @@ -Copyright (c) 2019-2022 Andrew Kane +Copyright (c) 2019-2024 Andrew Kane MIT License From a19430b163c36fb3570ed96384410ca852a989ef Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Jun 2024 11:25:07 -0700 Subject: [PATCH 153/158] Version bump to 0.4.2 [skip ci] --- CHANGELOG.md | 2 +- lib/disco/version.rb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c29cbcd..ead6fbb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.2 (unreleased) +## 0.4.2 (2024-06-24) - Removed dependency on `csv` gem for `load_movielens` diff --git a/lib/disco/version.rb b/lib/disco/version.rb index fb24cb8..c1cfd7c 100644 --- a/lib/disco/version.rb +++ b/lib/disco/version.rb @@ -1,3 +1,3 @@ module Disco - VERSION = "0.4.1" + VERSION = "0.4.2" end From cbbe4db3ccd02a98bead54228b1cdea1b8cf0c6c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Jun 2024 21:35:01 -0700 Subject: [PATCH 154/158] Improved code [skip ci] --- lib/disco/recommender.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/disco/recommender.rb b/lib/disco/recommender.rb index 72e6f73..80cfbec 100644 --- a/lib/disco/recommender.rb +++ b/lib/disco/recommender.rb @@ -64,8 +64,8 @@ def fit(train_set, validation_set: nil) end if @top_items - @item_count = [0] * @item_map.size - @item_sum = [0.0] * @item_map.size + @item_count = Array.new(@item_map.size, 0) + @item_sum = Array.new(@item_map.size, 0.0) train_set.each do |v| i = @item_map[v[:item_id]] @item_count[i] += 1 From 3047aa6a9837cd0ffbf6ac9607cec0b42d42a4e2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 21 Jul 2024 11:07:47 -0700 Subject: [PATCH 155/158] Improved test [skip ci] --- test/recommender_test.rb | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/recommender_test.rb b/test/recommender_test.rb index ad077a4..357cef4 100644 --- a/test/recommender_test.rb +++ b/test/recommender_test.rb @@ -23,9 +23,7 @@ def test_explicit assert_equal [1664, 20], recommender.item_factors.shape assert_equal [943, 20], recommender.user_factors.shape - - expected = data.map { |v| v[:rating] }.sum / data.size.to_f - assert_in_delta expected, recommender.global_mean + assert_in_delta 3.52986, recommender.global_mean recs = recommender.item_recs("Star Wars (1977)") assert_equal 5, recs.size From 506eba20888379d870e29703d30390482363a54f Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 26 Jul 2024 21:32:36 -0700 Subject: [PATCH 156/158] Updated cache action [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 1335c5e..4bb8899 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -25,7 +25,7 @@ jobs: with: ruby-version: ${{ matrix.ruby }} bundler-cache: true - - uses: actions/cache@v3 + - uses: actions/cache@v4 with: path: ~/.disco key: disco From 90b84954bfeb078b4f5c46c86abc605a7be4ea70 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 10 Aug 2024 14:52:44 -0700 Subject: [PATCH 157/158] Test with Active Record 7.2.0 on CI --- .github/workflows/build.yml | 4 ++-- Gemfile | 4 ++-- gemfiles/{activerecord72.gemfile => activerecord71.gemfile} | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) rename gemfiles/{activerecord72.gemfile => activerecord71.gemfile} (76%) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4bb8899..89c59df 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -7,9 +7,9 @@ jobs: matrix: include: - ruby: 3.3 - gemfile: gemfiles/activerecord72.gemfile - - ruby: 3.2 gemfile: Gemfile + - ruby: 3.2 + gemfile: gemfiles/activerecord71.gemfile - ruby: 3.1 gemfile: gemfiles/activerecord70.gemfile - ruby: "3.0" diff --git a/Gemfile b/Gemfile index 394375e..9bc16fd 100644 --- a/Gemfile +++ b/Gemfile @@ -4,8 +4,8 @@ gemspec gem "rake" gem "minitest", ">= 5" -gem "activerecord", "~> 7.1.0" -gem "sqlite3", "< 2" +gem "activerecord", "~> 7.2.0" +gem "sqlite3" gem "daru" gem "matrix" # for daru gem "rover-df" diff --git a/gemfiles/activerecord72.gemfile b/gemfiles/activerecord71.gemfile similarity index 76% rename from gemfiles/activerecord72.gemfile rename to gemfiles/activerecord71.gemfile index ba6b453..adb0692 100644 --- a/gemfiles/activerecord72.gemfile +++ b/gemfiles/activerecord71.gemfile @@ -4,8 +4,8 @@ gemspec path: ".." gem "rake" gem "minitest", ">= 5" -gem "activerecord", "~> 7.2.0.beta2" -gem "sqlite3" +gem "activerecord", "~> 7.1.0" +gem "sqlite3", "< 2" gem "daru" gem "matrix" # for daru gem "rover-df" From 8b5c4dd08a5be525b33319f9f23b3d40323dddb6 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 26 Aug 2024 17:22:08 -0700 Subject: [PATCH 158/158] Updated link [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 75d2da0..4c46d7d 100644 --- a/README.md +++ b/README.md @@ -194,7 +194,7 @@ json = File.read("recommender.json") recommender = Disco::Recommender.load_json(json) ``` -Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples). +Alternatively, you can store only the factors and use a library like [Neighbor](https://github.com/ankane/neighbor). See the [examples](https://github.com/ankane/neighbor/tree/master/examples/disco). ## Algorithms