Skip to content
This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Add GloVe #359

Merged
merged 5 commits into from
Nov 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Third-party dependencies of the word embedding tools, vendored under
# scripts/word_embeddings/extern.

# Command line parser
[submodule "scripts/word_embeddings/CLI11"]
	path = scripts/word_embeddings/extern/CLI11
	url = https://github.com/CLIUtils/CLI11.git
# Reading / writing numpy .npy and .npz files (fork, tracking its libzip branch)
[submodule "scripts/word_embeddings/cnpy"]
	path = scripts/word_embeddings/extern/cnpy
	url = https://github.com/leezu/cnpy
	branch = libzip
# Fast sparse hash map
[submodule "scripts/word_embeddings/sparsepp"]
	path = scripts/word_embeddings/extern/sparsepp
	url = https://github.com/greg7mdp/sparsepp.git
61 changes: 61 additions & 0 deletions scripts/word_embeddings/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# For OpenMP on OS X via 'brew install libomp' use CMake 3.12+.
# The policy range separator is three dots; with only two dots the upper bound
# is not recognised and policies are pinned to the minimum version.
cmake_minimum_required(VERSION 3.9...3.12)

# Declare the project before touching compiler flags: project() initialises the
# CMAKE_CXX_FLAGS* cache entries, which would otherwise discard values set
# beforehand on the first configure run.
project(EmbeddingsToolkit VERSION 1.0 LANGUAGES CXX)

# * Set and configure build types
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE Release)
endif()
# Append instead of overwrite so flags passed by the user are preserved.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CXX_FLAGS_DEBUG "-g")
set(CMAKE_CXX_FLAGS_RELEASE "-O3")

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)

# * Update submodules as needed
find_package(Git QUIET)
if(GIT_FOUND)
  option(GIT_SUBMODULE "Check submodules during build" ON)
  if(GIT_SUBMODULE)
    message(STATUS "Submodule update")
    execute_process(COMMAND ${GIT_EXECUTABLE} submodule update --init
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
                    RESULT_VARIABLE GIT_SUBMOD_RESULT)
    if(NOT GIT_SUBMOD_RESULT EQUAL "0")
      message(FATAL_ERROR "git submodule update --init failed with ${GIT_SUBMOD_RESULT}, please checkout submodules")
    endif()
  endif()
endif()

# * Compiler flags
# -march=native or similar is required by __sync_bool_compare_and_swap_16 in growt
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native")

# * Executables
add_subdirectory(extern/cnpy EXCLUDE_FROM_ALL)

# ** vocab_count
add_executable(vocab_count tools/vocab_count.cc)
target_include_directories(vocab_count PUBLIC extern/sparsepp)
target_include_directories(vocab_count PUBLIC extern/CLI11/include)
target_link_libraries(vocab_count PUBLIC stdc++fs)
target_link_libraries(vocab_count PUBLIC pthread)

# ** cooccur
add_executable(cooccur tools/cooccur.cc)
target_link_libraries(cooccur PUBLIC pthread)
target_include_directories(cooccur PUBLIC extern/sparsepp)
# NOTE(review): xxHash, growt and range-v3 are not registered as submodules in
# the .gitmodules visible alongside this file - confirm these include
# directories are still needed.
target_include_directories(cooccur PUBLIC extern/xxHash)
target_include_directories(cooccur PUBLIC extern/growt)
target_include_directories(cooccur PUBLIC extern/range-v3/include)
target_include_directories(cooccur PUBLIC extern/cnpy)
target_include_directories(cooccur PUBLIC extern/CLI11/include)
target_link_libraries(cooccur PUBLIC stdc++fs)
target_link_libraries(cooccur PUBLIC cnpy)
# OpenMP is optional; link it only when the toolchain provides it.
find_package(OpenMP)
if(OpenMP_CXX_FOUND)
  target_link_libraries(cooccur PUBLIC OpenMP::OpenMP_CXX)
endif()

1 change: 1 addition & 0 deletions scripts/word_embeddings/extern/CLI11
Submodule CLI11 added at 899100
1 change: 1 addition & 0 deletions scripts/word_embeddings/extern/cnpy
Submodule cnpy added at 82ff25
1 change: 1 addition & 0 deletions scripts/word_embeddings/extern/sparsepp
Submodule sparsepp added at 6da0f6
314 changes: 314 additions & 0 deletions scripts/word_embeddings/tools/cooccur.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
// Tool to calculate word-word cooccurrence statistics
//
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// * Includes and definitions
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <deque>
#include <filesystem>
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <random>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "CLI/CLI.hpp" // command line parser
#include "cnpy.h" // numpy
#include "sparsepp/spp.h" // fast sparse hash map

#include "utils.h"

namespace fs = std::filesystem;
// Maps a word to its (rank, count) pair as filled in by ReadVocab.
using Vocab = spp::sparse_hash_map<std::string, std::pair<uint32_t, uint32_t>>;
// Value type of a single co-occurrence cell. Cells can be fractional because
// contexts may be weighted by their distance.
using count_type = float;
// Sparse co-occurrence matrix. The 64 bit key packs the row index into the
// upper 32 bits and the column index into the lower 32 bits.
using Matrix = spp::sparse_hash_map<uint64_t, count_type>;

// * Arguments
// Weighting schemes for a context word at a given distance from the center
// word. The integer values (Harmonic=0, DistanceOverSize=1, None=2) are what is
// read from and written to streams.
enum class ContextWeight { Harmonic, DistanceOverSize, None };

// Reads a ContextWeight encoded as its integer value.
//
// If no integer can be extracted, or the integer does not name an enumerator,
// the stream's failbit is set and context_weight is left untouched, so callers
// can detect bad input instead of receiving an out-of-range enum value.
std::istream &operator>>(std::istream &in, ContextWeight &context_weight) {
  int value;
  if (!(in >> value)) {
    return in;
  }
  constexpr int first = static_cast<int>(ContextWeight::Harmonic);
  constexpr int last = static_cast<int>(ContextWeight::None);
  if (value < first || value > last) {
    in.setstate(std::ios_base::failbit);
    return in;
  }
  context_weight = static_cast<ContextWeight>(value);
  return in;
}

// Writes a ContextWeight as its integer value.
std::ostream &operator<<(std::ostream &out,
                         const ContextWeight &context_weight) {
  return out << static_cast<int>(context_weight);
}

// Arguments specified via command line options. See ParseArgs for
// documentation.
struct Arguments {
  // Number of worker threads processing input files.
  unsigned int num_threads = 1;
  // Maximum distance between a word and the context words counted for it.
  unsigned int window_size = 5;
  // If true, each (word, context) pair is stored in one direction only
  // instead of being folded into a single symmetric entry.
  bool no_symmetric = false;
  // If true, words are randomly discarded based on their frequency.
  bool subsample = false;
  // Weighting applied to a co-occurrence depending on its distance. Explicitly
  // initialized so that it is well defined when the option is not passed on
  // the command line.
  ContextWeight context_weight = ContextWeight::Harmonic;
};

// Parses the command line.
//
// Returns a tuple (paths, output, args): the queue of input files to process,
// the name of the .npz file to write and the remaining options. On a parse
// error (or --help) the process exits with the code reported by CLI11.
auto ParseArgs(int argc, char **argv) {
  // Performance optimizations for writing to stdout
  std::ios::sync_with_stdio(false);

  Arguments args;
  std::vector<fs::path> files;
  std::string output = "cooccurrences.npz";

  CLI::App app("Simple tool to calculate word-word cooccurrence statistics");
  app.add_option("FILES", files, "File names")->check(CLI::ExistingPath);
  // The archive holds row / col / data arrays, i.e. coordinate (COO) format.
  app.add_option("-o,--output", output,
                 "Output file name. Co-occurrence matrix is saved as "
                 "scipy.sparse compatible COO matrix (row, col, data) in a "
                 "numpy .npz archive");
  app.add_option("-w,--window-size", args.window_size,
                 "Window size in which to count co-occurences.");
  app.add_flag("--no-symmetric", args.no_symmetric,
               "If not specified, a symmetric context window is used.");
  app.add_flag("--subsample", args.subsample,
               "Apply subsampling during co-occurence matrix construction as "
               "in Word2Vec .");
  app.add_set("-c,--context-weights", args.context_weight,
              {ContextWeight::Harmonic, ContextWeight::DistanceOverSize,
               ContextWeight::None},
              "Weighting scheme for contexts.")
      ->type_name("ContextWeight in {Harmonic=0, DistanceOverSize=1, None=2}");
  app.add_option(
         "-j,--numThreads", args.num_threads,
         "Number of threads to use. Each thread constructs an "
         "independent vocabulary which are finally merged. Only appropriate "
         "when multiple, sufficiently large input files are specified.")
      ->check(CLI::Range(1U, std::numeric_limits<unsigned int>::max()));

  try {
    app.parse(argc, argv);
  } catch (const CLI::ParseError &e) {
    std::exit(app.exit(e));
  }

  // Worker threads consume the input files from a shared queue.
  std::queue<fs::path> paths;
  for (auto &file : files) {
    paths.emplace(file);
  }

  // Move the locals into the tuple instead of copying them.
  return std::make_tuple(std::move(paths), std::move(output), args);
}

// * Input
// Reads the vocabulary from stdin.
//
// The input is a whitespace separated sequence of "word count" pairs. Each
// word is assigned a rank equal to its position in the input (starting at 0).
// A trailing word without a count is ignored.
//
// Throws std::invalid_argument if a count is not a number and
// std::out_of_range if it does not fit into 32 bits.
auto ReadVocab() {
  Vocab vocab;
  std::string word;
  std::string count;
  uint32_t rank{0};
  // Extract word and count together so that a missing count ends the loop
  // instead of silently reusing the previous one.
  while (std::cin >> word >> count) {
    // Parse as unsigned 64 bit and range check explicitly: counts of frequent
    // words in large corpora can exceed the range of int.
    const unsigned long long parsed = std::stoull(count);
    if (parsed > std::numeric_limits<uint32_t>::max()) {
      throw std::out_of_range("Count of '" + word +
                              "' does not fit into 32 bits: " + count);
    }
    vocab[word] = {rank, static_cast<uint32_t>(parsed)};
    ++rank;
  }
  return vocab;
}
// * Co-occurrence matrix construction
// Held by ReadMatrix while it checks and pops the shared queue of input paths.
std::mutex paths_m;
// Held by ReadMatrix while it pushes its finished matrix to the result queue.
std::mutex matrices_m;

// Worker routine that counts co-occurrences.
//
// Repeatedly takes a path from `paths` (guarded by paths_m) and accumulates
// the co-occurrences found in that file into a matrix local to this call.
// Once `paths` is empty the matrix is pushed to `matrices` (guarded by
// matrices_m).
//
// Only the words preceding a word on the same line (at most args.window_size
// of them) are considered as its context. Words missing from `vocab` are
// skipped and do not take up a position in the window.
//
// `seed` must be positive. It is currently only checked; the random engine
// used for subsampling is seeded from std::random_device.
//
// Throws std::invalid_argument if a file cannot be opened. Note that this
// function is run as a thread entry point by ComputeCooccurrenceMatrix, where
// an escaping exception terminates the process.
void ReadMatrix(std::queue<fs::path> &paths, queue<Matrix> &matrices,
                const Vocab &vocab, const Arguments &args, uint32_t seed) {
  assert(seed > 0);
  std::string line;
  std::deque<uint32_t> history; // Ranks of preceding words, oldest first
  std::unique_ptr<Matrix> m = std::make_unique<Matrix>();

  // Prepare subsampling
  std::random_device r;
  std::default_random_engine random_engine(r());
  std::uniform_real_distribution<float> uniform_dist(0, 1);
  std::vector<double> idx_to_pdiscard; // Indexed by word rank
  if (args.subsample) {
    // Accumulate in double: the total count easily exceeds the range of int.
    double sum_counts = 0;
    uint32_t max_rank = 0;
    for (const auto &e : vocab) {
      sum_counts += e.second.second;
      if (e.second.first > max_rank) {
        max_rank = e.second.first;
      }
    }
    const double t = 1E-4;
    // The hash map iterates in arbitrary order, so every probability is
    // stored at the rank of its word rather than appended.
    idx_to_pdiscard.assign(
        vocab.empty() ? 0 : static_cast<size_t>(max_rank) + 1, 0.0);
    for (const auto &e : vocab) {
      const auto rank = e.second.first;
      const auto count = e.second.second;
      idx_to_pdiscard[rank] = 1 - std::sqrt(t / (count / sum_counts));
    }
  }

  while (true) {
    fs::path path;
    {
      std::scoped_lock lock(paths_m);
      if (paths.empty()) {
        break;
      }
      path = paths.front();
      paths.pop();
    }

    std::ifstream in{path};
    if (!in.is_open()) {
      throw std::invalid_argument(path.string() + " cannot be opened!");
    }
    while (std::getline(in, line)) {
      history.clear(); // Discard context from other lines
      std::stringstream stream(line);
      std::string word;
      while (stream >> word) {
        // TODO We must construct an extra std::string for every word due to
        // missing support for heterogenous lookup in unordered map. Once
        // https://wg21.link/P0919 is merged construct a string_view instead.
        // std::string_view(&*word.begin(), ranges::distance(word))
        const auto word_rank_it = vocab.find(word);
        // Skip words not contained in the vocabulary
        if (word_rank_it == vocab.end()) {
          continue;
        }
        const uint32_t word_rank = word_rank_it->second.first;

        if (args.subsample &&
            uniform_dist(random_engine) <= idx_to_pdiscard[word_rank]) {
          continue;
        }

        const auto history_size = history.size();
        for (unsigned int distance = 1; distance <= history_size; distance++) {
          // The most recent word sits at the back of the history, so the word
          // at `distance` is found counting from the end.
          const uint32_t context_word_rank = history[history_size - distance];
          uint64_t key; // We merge 32 bit row and col indices to a single 64
                        // bit key
          if (!args.no_symmetric) {
            // For symmetric contexts, only store one direction: the smaller
            // rank always goes to the upper 32 bits.
            const uint32_t low =
                word_rank <= context_word_rank ? word_rank : context_word_rank;
            const uint32_t high =
                word_rank <= context_word_rank ? context_word_rank : word_rank;
            key = (static_cast<uint64_t>(low) << 32) | high;
          } else {
            key = (static_cast<uint64_t>(word_rank) << 32) | context_word_rank;
          }

          if (args.context_weight == ContextWeight::Harmonic) {
            (*m)[key] += 1.0f / static_cast<count_type>(distance);
          } else if (args.context_weight == ContextWeight::DistanceOverSize) {
            // Linearly decaying weight: 1 for the adjacent word down to
            // 1 / window_size for the farthest one. Computed in floating
            // point; distance <= window_size holds here, so the numerator
            // cannot underflow.
            (*m)[key] +=
                static_cast<count_type>(args.window_size - distance + 1) /
                static_cast<count_type>(args.window_size);
          } else {
            (*m)[key]++;
          }
        }

        // Update circular history buffer. Nothing is kept for an empty
        // window, which also avoids popping from an empty deque.
        if (args.window_size > 0) {
          if (history.size() == args.window_size) {
            history.pop_front();
          }
          history.push_back(word_rank);
        }
      }
    }
  }
  {
    std::scoped_lock lock(matrices_m);
    matrices.push(std::move(m));
  }
}

// Pops `num_threads` matrices from `matrices` and sums them entry-wise into a
// single matrix, which is returned.
std::unique_ptr<Matrix> CombineMatrices(queue<Matrix> &matrices,
                                        int num_threads) {
  std::unique_ptr<Matrix> merged = matrices.pop();
  for (int popped = 1; popped < num_threads; ++popped) {
    std::unique_ptr<Matrix> next = matrices.pop();
    // Keep the larger matrix as accumulator so that the smaller one is the one
    // being iterated over and inserted.
    if (merged->size() < next->size()) {
      std::swap(merged, next);
    }
    for (const auto &entry : *next) {
      (*merged)[entry.first] += entry.second;
    }
  }
  return merged;
}

// Starts args.num_threads workers running ReadMatrix over the shared `paths`
// queue, merges their per-thread matrices and returns the combined matrix
// after all workers have been joined.
auto ComputeCooccurrenceMatrix(Vocab &vocab, std::queue<fs::path> &paths,
                               const Arguments &args) {
  queue<Matrix> matrices;
  std::vector<std::thread> workers;
  for (unsigned int worker = 0; worker < args.num_threads; worker++) {
    // Seeds handed to the workers start at 1.
    workers.emplace_back([&paths, &matrices, &vocab, &args, worker]() {
      ReadMatrix(paths, matrices, vocab, args, worker + 1);
    });
  }

  // Merging is started before joining the workers.
  std::unique_ptr<Matrix> combined =
      CombineMatrices(matrices, args.num_threads);
  for (auto &worker_thread : workers) {
    worker_thread.join();
  }
  return combined;
}

// Converts the sparse matrix into coordinate format.
//
// Returns a tuple (row, col, data) of equally sized vectors, one element per
// non-zero entry of `m`. The row index is taken from the upper and the column
// index from the lower 32 bits of each key. Also reports the number of
// non-zero entries and the matrix shape on stdout.
auto ToCOO(const Vocab &vocab, std::unique_ptr<Matrix> m) {
  const size_t num_tokens = vocab.size();
  const size_t nnz = m->size();
  std::cout << "Got " << nnz
            << " non-zero entries in cooccurrence matrix of shape ("
            << num_tokens << ", " << num_tokens << ")" << std::endl;
  std::vector<uint32_t> row;
  std::vector<uint32_t> col;
  std::vector<count_type> data;
  row.reserve(nnz);
  col.reserve(nnz);
  data.reserve(nnz);
  for (const auto &e : *m) {
    row.push_back(static_cast<uint32_t>(e.first >> 32));
    col.push_back(static_cast<uint32_t>(e.first & 0xffffffff));
    data.push_back(e.second);
  }
  // Move the vectors into the tuple; copying would temporarily double the
  // memory needed for the (potentially very large) result.
  return std::make_tuple(std::move(row), std::move(col), std::move(data));
}

// * Output
void WriteNumpy(const std::string output, const std::vector<uint32_t> &row,
const std::vector<uint32_t> &col,
const std::vector<count_type> &data, const bool symmetric,
const uint32_t num_tokens) {

assert(row.size() == data.size());
assert(col.size() == data.size());
cnpy::npz_save(output, "row", &row[0], {row.size()}, "w");
cnpy::npz_save(output, "col", &col[0], {col.size()}, "a");
cnpy::npz_save(output, "data", &data[0], {data.size()}, "a");
cnpy::npz_save(output, "num_tokens", &num_tokens, {1}, "a");
cnpy::npz_save(output, "symmetric", &symmetric, {1}, "a");
}

// * Main
int main(int argc, char **argv) {
auto [paths, output, args] = ParseArgs(argc, argv);
auto vocab = ReadVocab();
auto cooccurenceMatrix = ComputeCooccurrenceMatrix(vocab, paths, args);
auto [row, col, data] = ToCOO(vocab, std::move(cooccurenceMatrix));
WriteNumpy(output, row, col, data, !args.no_symmetric, vocab.size());
return 0;
}
Loading