dismecpp/prediction_2prediction_8cpp_source.html

 // Copyright (c) 2021, Aalto University, developed by Erik Schultheis

 // All rights reserved.

 //

 // SPDX-License-Identifier: MIT


 #include "prediction.h"


 #include <utility>

 #include "data/data.h"

 #include "spdlog/spdlog.h"

 #include "spdlog/fmt/fmt.h"

 #include "model/model.h"


 using namespace dismec;

 using namespace dismec::prediction;


 // Sets up the shared prediction state: remembers the dataset and the (possibly partial) model, and
 // hands the dataset's feature matrix to the replicator that produces NUMA-local copies.
 // Throws std::invalid_argument if model and data disagree on the number of labels or features.
 PredictionBase::PredictionBase(const DatasetBase* data,
                                std::shared_ptr<const Model> model) :
         m_Data(data), m_Model(std::move(model)), m_FeatureReplicator(m_Data->get_features())
 {
     // Label dimension is validated first, so a label mismatch is reported before a feature mismatch.
     const auto model_labels = m_Model->num_labels();
     const auto data_labels = data->num_labels();
     if(model_labels != data_labels) {
         throw std::invalid_argument(
                 fmt::format("Mismatched number of labels between model ({}) and data ({})",
                             model_labels, data_labels));
     }

     const auto model_features = m_Model->num_features();
     const auto data_features = data->num_features();
     if(model_features != data_features) {
         throw std::invalid_argument(
                 fmt::format("Mismatched number of features between model ({}) and data ({})",
                             model_features, data_features));
     }
 }


 // Sizes the vector of per-thread feature-matrix handles to `num_threads` entries.
 // The individual entries are assigned later, in `init_thread`.
 void PredictionBase::make_thread_local_features(long num_threads) {
     auto& feature_slots = m_ThreadLocalFeatures;
     feature_slots.resize(num_threads);
 }


 // Stores the replicator's local copy of the feature matrix in the slot that belongs to `thread_id`.
 // `at()` is used for the slot lookup, so an index outside the prepared range throws.
 void PredictionBase::init_thread(parallel::thread_id_t thread_id) {
     // Fetch the local copy first and look up the slot second, then move the handle into place.
     auto local_features = m_FeatureReplicator.get_local();
     m_ThreadLocalFeatures.at(thread_id.to_index()) = std::move(local_features);
 }


 namespace {
     // Wraps the rows [begin, end) of a dense feature matrix as the model's generic input type.
     Model::FeatureMatrixIn make_matrix(const DenseFeatures& features, long begin, long end) {
         const long num_rows = end - begin;
         return Model::FeatureMatrixIn::DenseRowMajorRef{features.middleRows(begin, num_rows)};
     }

     // Wraps the rows [begin, end) of a sparse feature matrix as the model's generic input type.
     Model::FeatureMatrixIn make_matrix(const SparseFeatures& features, long begin, long end) {
         const long num_rows = end - begin;
         return Model::FeatureMatrixIn::SparseRowMajorRef{features.middleRows(begin, num_rows)};
     }
 }


 // Predicts the scores for the instances in the half-open interval [begin, end) and writes them to
 // `target`. The features are read from the feature matrix stored for `thread_id`.
 void PredictionBase::do_prediction(long begin, long end, thread_id_t thread_id, Eigen::Ref<PredictionMatrix> target) {
     auto& thread_features = m_ThreadLocalFeatures.at(thread_id.to_index());

     // The feature matrix is a generic (dense or sparse) matrix; dispatch on the concrete type and
     // pass the selected rows on to the model.
     auto predict_rows = [&](const auto& features) {
         m_Model->predict_scores(make_matrix(features, begin, end), target);
     };
     visit(predict_rows, *thread_features);
 }


 // Forwards data and model to PredictionBase and allocates the full prediction matrix with one row
 // per example and one column per label of the dataset.
 FullPredictionTaskGenerator::FullPredictionTaskGenerator(const DatasetBase* data, std::shared_ptr<const Model> model) :
     PredictionBase(data, std::move(model))
 {
     const long num_rows = data->num_examples();
     const long num_cols = data->num_labels();
     m_Predictions.resize(num_rows, num_cols);
 }


 // One task per example in the dataset.
 long FullPredictionTaskGenerator::num_tasks() const
 {
     const long example_count = m_Data->num_examples();
     return example_count;
 }


 // Computes the scores for the examples [begin, end) and stores them in the matching rows of the
 // full prediction matrix.
 void FullPredictionTaskGenerator::run_tasks(long begin, long end, thread_id_t thread_id)
 {
     const long num_rows = end - begin;
     do_prediction(begin, end, thread_id, m_Predictions.middleRows(begin, num_rows));
 }


 // Called to notify the generator about the number of threads. Only the per-thread feature slots
 // need to be set up here; the chunk size is not used.
 void FullPredictionTaskGenerator::prepare(long num_threads, long /*chunk_size*/) {
     make_thread_local_features(num_threads);
 }


 // Sets up top-k prediction for `data` with the given (possibly partial) `model`.
 //
 // Allocates the global top-k value/index buffers (one row per example, `K` columns), initialises all
 // top-k values to -infinity so that any real score replaces them, builds the per-example list of
 // ground-truth labels, and zeroes the global confusion matrix.
 //
 // Throws std::invalid_argument if
 //   * model and data are incompatible (checked by PredictionBase),
 //   * `K` is not positive (`run_tasks` reads column `K - 1` of the top-k buffers),
 //   * `data` is not a MultiLabelData (the ground truth is read through that interface).
 TopKPredictionTaskGenerator::TopKPredictionTaskGenerator(const DatasetBase* data, std::shared_ptr<const Model> model, long K) :
         PredictionBase(data, std::move(model)), m_K(K)
 {
     if(m_K <= 0) {
         throw std::invalid_argument(
                 fmt::format("Number of top-k predictions must be positive, got {}", m_K));
     }

     // Resolve the concrete dataset type once and verify it, instead of dereferencing an unchecked
     // dynamic_cast result for every single label.
     const auto* multi_label_data = dynamic_cast<const MultiLabelData*>(data);
     if(multi_label_data == nullptr) {
         throw std::invalid_argument("Top-k prediction requires a dataset of type MultiLabelData");
     }

     m_TopKValues.resize(data->num_examples(), m_K);
     m_TopKIndices.resize(data->num_examples(), m_K);
     m_TopKValues.setConstant(-std::numeric_limits<real_t>::infinity());

     // generate a transpose of the label matrix: for each example, the list of its positive labels.
     // Labels are visited in increasing order, so each per-example list ends up sorted ascending.
     std::vector<std::vector<long>> examples_to_labels(data->num_examples());
     for(label_id_t label{0}; label.to_index() < data->num_labels(); ++label) {
         for(auto example : multi_label_data->get_label_instances(label)) {
             examples_to_labels[example].push_back(label.to_index());
         }
     }

     m_GroundTruth = std::move(examples_to_labels);
     m_ConfusionMatrix.fill(std::int64_t{0});
 }


 // One task per example in the dataset.
 long TopKPredictionTaskGenerator::num_tasks() const {
     const long example_count = m_Data->num_examples();
     return example_count;
 }


 // Called to notify the generator about the number of threads. Allocates all thread-local scratch
 // space: a score cache (`chunk_size` rows, one column per model weight vector), top-k index/value
 // buffers (`chunk_size` x K), the per-thread feature slots, and zeroed per-thread confusion counters.
 void TopKPredictionTaskGenerator::prepare(long num_threads, long chunk_size) {
     m_ThreadLocalPredictionCache.resize(num_threads);
     for(auto& scores : m_ThreadLocalPredictionCache) {
         scores.resize(chunk_size, m_Model->num_weights());
     }

     // The two top-k buffers share the same shape, so they are allocated by the same helper.
     auto allocate_topk_buffers = [&](auto& buffers) {
         buffers.resize(num_threads);
         for(auto& buffer : buffers) {
             buffer.resize(chunk_size, m_K);
         }
     };
     allocate_topk_buffers(m_ThreadLocalTopKIndices);
     allocate_topk_buffers(m_ThreadLocalTopKValues);

     make_thread_local_features(num_threads);

     m_ThreadLocalConfusionMatrix.resize(num_threads);
     for(auto& counters : m_ThreadLocalConfusionMatrix) {
         counters.fill(0);
     }
 }


 // Called after all threads have finished their tasks. Releases the thread-local score caches and
 // adds every thread's confusion-matrix counters onto the global confusion matrix.
 void TopKPredictionTaskGenerator::finalize() {
     m_ThreadLocalPredictionCache.clear();

     for(const auto& thread_counters : m_ThreadLocalConfusionMatrix) {
         for(std::size_t entry = 0; entry < m_ConfusionMatrix.size(); ++entry) {
             m_ConfusionMatrix[entry] += thread_counters[entry];
         }
     }
 }


 // Processes the examples in the half-open range [begin, end) using the buffers of thread `thread_id`:
 //   1. computes the raw scores of the current model for these examples into the thread-local cache,
 //   2. updates this thread's confusion-matrix counters, where a score > 0 counts as a positive prediction,
 //   3. merges the new scores into the running top-k values/indices of each example and writes the
 //      result back to the global top-k buffers.
 // The top-k state is loaded from the global buffers before merging, so entries already stored for
 // these rows take part in the merge instead of being overwritten.
 // NOTE(review): `end - begin` presumably must not exceed the `chunk_size` passed to `prepare`, because
 // the score cache is allocated with that many rows -- confirm against the task runner.
 void TopKPredictionTaskGenerator::run_tasks(long begin, long end, thread_id_t thread_id) {

     // thread-local scratch buffers; `at()` throws if `thread_id` is outside the prepared range
     auto& prediction_matrix = m_ThreadLocalPredictionCache.at(thread_id.to_index());

     auto& topk_vals = m_ThreadLocalTopKValues.at(thread_id.to_index());

     auto& topk_idx = m_ThreadLocalTopKIndices.at(thread_id.to_index());

     auto& cm = m_ThreadLocalConfusionMatrix.at(thread_id.to_index());


     // quick access to the label indices that are currently active
     // column j of the score matrix corresponds to global label `index_offset + j` (see `index` below)

     long index_offset = m_Model->labels_begin().to_index();

     long last_index = m_Model->labels_end().to_index();


     // load from global buffer, in case we do a reduction

     topk_idx = m_TopKIndices.middleRows(begin, end-begin);

     topk_vals = m_TopKValues.middleRows(begin, end-begin);


     // generate raw predictions in prediction_matrix
     // (only the first `end - begin` rows of the cache are used; row `r` belongs to example `begin + r`)

     do_prediction(begin, end, thread_id, prediction_matrix.middleRows(0, end-begin));


     // confusion matrix
     // first pass: walk the ground-truth labels to count true positives and ground-truth positives

     std::int64_t true_positives = 0;

     std::int64_t num_gt_positives = 0;

     for(long sample = begin; sample < end; ++sample) {

         // iterate over all true values

         for(auto& gt : m_GroundTruth[sample])

         {

             // we have to take into account that we are potentially only looking at a subset of the labels.
             // The early `break` relies on the ground-truth list being sorted ascending; the constructor
             // fills it by visiting labels in increasing order.

             if(gt < index_offset) continue;

             if(gt >= last_index) break;


             // correctly predicted true label

             if(prediction_matrix.coeff(sample - begin, gt - index_offset) > 0) {

                 ++true_positives;

             }

             ++num_gt_positives;

         }

     }


     // second pass: visit every score once, counting positive predictions and updating the top-k lists

     std::int64_t positive_prediction = 0;

     for(long t = 0; t < end - begin; ++t) {

         // value in the last top-k column of this row; scores below it are skipped without
         // entering the insertion loop

         double threshold = topk_vals.coeff(t, m_TopKValues.cols() - 1);


         // reduce to top k

         for(long j = 0; j < prediction_matrix.cols(); ++j)

         {

             real_t value = prediction_matrix.coeff(t, j);

             if(value > 0)   ++positive_prediction;

             if(value < threshold) {

                 continue;

             }


             // translate the local column into the global label index

             long index = index_offset + j;

             for(long k = 0; k < m_K; ++k) {

                 // search for the first entry where we are larger. Once we've inserted this value,

                 // move the other values to the right.
                 // After a swap, `value`/`index` hold the displaced entry, which is then compared against
                 // the remaining columns; whatever is left in them after the last column is dropped.

                 if(value > topk_vals.coeff(t, k)) {

                     value = std::exchange(topk_vals.coeffRef(t, k), value);

                     index = std::exchange(topk_idx.coeffRef(t, k), index);

                 }

             }


             // update the threshold: this is the value in the last column

             threshold = topk_vals.coeff(t, topk_vals.cols() - 1);

         }

     }


     // number of (example, label) pairs scored in this call

     std::int64_t total = (end - begin) * prediction_matrix.cols();

     // everything that is neither predicted positive nor a ground-truth positive; true positives are
     // added back because they were subtracted by both of the other terms

     std::int64_t true_neg = total - positive_prediction - num_gt_positives + true_positives;


     // accumulate into this thread's counters; `finalize` sums them into the global confusion matrix

     cm[TRUE_POSITIVES]  += true_positives;

     cm[FALSE_NEGATIVES] += num_gt_positives - true_positives;

     cm[FALSE_POSITIVES] += positive_prediction - true_positives;

     cm[TRUE_NEGATIVES]  += true_neg;


     // copy to global buffer

     m_TopKIndices.middleRows(begin, end-begin) = topk_idx;

     m_TopKValues.middleRows(begin, end-begin) = topk_vals;

 }


 // Replaces the model used for prediction. No other state is touched; in particular the top-k
 // buffers and the confusion matrix keep their current contents.
 void TopKPredictionTaskGenerator::update_model(std::shared_ptr<const Model> model) {
     auto replacement = std::move(model);
     m_Model = std::move(replacement);
 }

dismec::DatasetBase
Definition: data.h:15

dismec::DatasetBase::num_examples
long num_examples() const noexcept
Get the total number of instances, i.e. the number of rows in the feature matrix.
Definition: data.cpp:52

dismec::DatasetBase::num_labels
virtual long num_labels() const noexcept=0

dismec::DatasetBase::num_features
long num_features() const noexcept
Get the total number of features, i.e. the number of columns in the feature matrix.
Definition: data.cpp:48

dismec::MultiLabelData
Definition: data.h:86

dismec::label_id_t
Strong typedef for an int to signify a label id.
Definition: types.h:20

dismec::opaque_int_type::to_index
constexpr T to_index() const
Explicitly convert to an integer.
Definition: opaque_int.h:32

dismec::parallel::thread_id_t
Strong typedef for an int to signify a thread id.
Definition: thread_id.h:20

dismec::prediction::FullPredictionTaskGenerator::num_tasks
long num_tasks() const override
Definition: prediction.cpp:65

dismec::prediction::FullPredictionTaskGenerator::run_tasks
void run_tasks(long begin, long end, thread_id_t thread_id) override
Definition: prediction.cpp:70

dismec::prediction::FullPredictionTaskGenerator::m_Predictions
PredictionMatrix m_Predictions
Definition: prediction.h:81

dismec::prediction::FullPredictionTaskGenerator::FullPredictionTaskGenerator
FullPredictionTaskGenerator(const DatasetBase *data, std::shared_ptr< const Model > model)
Definition: prediction.cpp:59

dismec::prediction::FullPredictionTaskGenerator::prepare
void prepare(long num_threads, long chunk_size) override
Called to notify the TaskGenerator about the number of threads.
Definition: prediction.cpp:75

dismec::prediction::PredictionBase
Base class for handling predictions.
Definition: prediction.h:34

dismec::prediction::PredictionBase::do_prediction
void do_prediction(long begin, long end, thread_id_t thread_id, Eigen::Ref< PredictionMatrix > target)
Predicts the scores for a subset of the instances given by the half-open interval [begin, end).
Definition: prediction.cpp:51

dismec::prediction::PredictionBase::init_thread
void init_thread(thread_id_t thread_id) final
Called once a thread has spun up, but before it runs its first task.
Definition: prediction.cpp:38

dismec::prediction::PredictionBase::PredictionBase
PredictionBase(const DatasetBase *data, std::shared_ptr< const Model > model)
Constructor, checks that data and model are compatible.
Definition: prediction.cpp:17

dismec::prediction::PredictionBase::m_ThreadLocalFeatures
std::vector< std::shared_ptr< const GenericFeatureMatrix > > m_ThreadLocalFeatures
Definition: prediction.h:67

dismec::prediction::PredictionBase::m_Model
std::shared_ptr< const Model > m_Model
Model (possibly partial) for which prediction is run.
Definition: prediction.h:41

dismec::prediction::PredictionBase::m_Data
const DatasetBase * m_Data
Data on which the prediction is run.
Definition: prediction.h:40

dismec::prediction::PredictionBase::m_FeatureReplicator
parallel::NUMAReplicator< const GenericFeatureMatrix > m_FeatureReplicator
The NUMAReplicator that generates NUMA-local copies for the feature matrices.
Definition: prediction.h:63

dismec::prediction::PredictionBase::make_thread_local_features
void make_thread_local_features(long num_threads)
Definition: prediction.cpp:34

dismec::prediction::TopKPredictionTaskGenerator::TRUE_POSITIVES
static constexpr const int TRUE_POSITIVES
Definition: prediction.h:101

dismec::prediction::TopKPredictionTaskGenerator::m_TopKIndices
IndexMatrix m_TopKIndices
Definition: prediction.h:109

dismec::prediction::TopKPredictionTaskGenerator::finalize
void finalize() override
Called after all threads have finished their tasks.
Definition: prediction.cpp:124

dismec::prediction::TopKPredictionTaskGenerator::m_ThreadLocalTopKValues
std::vector< PredictionMatrix > m_ThreadLocalTopKValues
Definition: prediction.h:112

dismec::prediction::TopKPredictionTaskGenerator::m_ThreadLocalPredictionCache
std::vector< PredictionMatrix > m_ThreadLocalPredictionCache
Definition: prediction.h:111

dismec::prediction::TopKPredictionTaskGenerator::update_model
void update_model(std::shared_ptr< const Model > model)
Definition: prediction.cpp:210

dismec::prediction::TopKPredictionTaskGenerator::m_GroundTruth
std::vector< std::vector< long > > m_GroundTruth
Definition: prediction.h:116

dismec::prediction::TopKPredictionTaskGenerator::m_ThreadLocalTopKIndices
std::vector< IndexMatrix > m_ThreadLocalTopKIndices
Definition: prediction.h:113

dismec::prediction::TopKPredictionTaskGenerator::TRUE_NEGATIVES
static constexpr const int TRUE_NEGATIVES
Definition: prediction.h:103

dismec::prediction::TopKPredictionTaskGenerator::m_K
long m_K
Definition: prediction.h:106

dismec::prediction::TopKPredictionTaskGenerator::TopKPredictionTaskGenerator
TopKPredictionTaskGenerator(const DatasetBase *data, std::shared_ptr< const Model > model, long K)
Definition: prediction.cpp:80

dismec::prediction::TopKPredictionTaskGenerator::FALSE_POSITIVES
static constexpr const int FALSE_POSITIVES
Definition: prediction.h:102

dismec::prediction::TopKPredictionTaskGenerator::FALSE_NEGATIVES
static constexpr const int FALSE_NEGATIVES
Definition: prediction.h:104

dismec::prediction::TopKPredictionTaskGenerator::run_tasks
void run_tasks(long begin, long end, thread_id_t thread_id) override
Definition: prediction.cpp:133

dismec::prediction::TopKPredictionTaskGenerator::num_tasks
long num_tasks() const override
Definition: prediction.cpp:99

dismec::prediction::TopKPredictionTaskGenerator::prepare
void prepare(long num_threads, long chunk_size) override
Called to notify the TaskGenerator about the number of threads.
Definition: prediction.cpp:103

dismec::prediction::TopKPredictionTaskGenerator::m_ConfusionMatrix
std::array< std::int64_t, 4 > m_ConfusionMatrix
Definition: prediction.h:117

dismec::prediction::TopKPredictionTaskGenerator::m_TopKValues
PredictionMatrix m_TopKValues
Definition: prediction.h:108

dismec::prediction::TopKPredictionTaskGenerator::m_ThreadLocalConfusionMatrix
std::vector< std::array< std::int64_t, 4 > > m_ThreadLocalConfusionMatrix
Definition: prediction.h:114

dismec::types::GenericMatrixRef
Definition: eigen_generic.h:163

dismec::types::GenericMatrixRef::SparseRowMajorRef
Eigen::Ref< SparseRowMajor< T > > SparseRowMajorRef
Definition: eigen_generic.h:169

dismec::types::GenericMatrixRef::DenseRowMajorRef
Eigen::Ref< DenseRowMajor< T > > DenseRowMajorRef
Definition: eigen_generic.h:167

data.h

model.h

anonymous_namespace{prediction.cpp}::make_matrix
Model::FeatureMatrixIn make_matrix(const SparseFeatures &features, long begin, long end)
Definition: prediction.cpp:46

anonymous_namespace{py_data.cpp}::get_features
auto get_features(const DatasetBase &ds)
Definition: py_data.cpp:28

dismec::prediction
Definition: evaluate.h:14

dismec::types::visit
auto visit(F &&f, Variants &&... variants)
Definition: eigen_generic.h:95

dismec
Main namespace in which all types, classes, and functions are defined.
Definition: app.h:15

dismec::DenseFeatures
types::DenseRowMajor< real_t > DenseFeatures
Dense Feature Matrix in Row Major format.
Definition: matrix_types.h:58

dismec::SparseFeatures
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
Definition: matrix_types.h:50

dismec::real_t
float real_t
The default type for floating point values.
Definition: config.h:17

prediction.h