16 #include "CLI/CLI.hpp"
18 #include "spdlog/spdlog.h"
19 #include "nlohmann/json.hpp"
// Registers macro-averaged metrics at cutoff `k` on the given metrics
// collector and returns the reporter so further measurements can be attached.
// NOTE(review): this listing is a fragment — source lines are missing between
// the visible statements (e.g. original line 26), so comments only describe
// what is actually shown here.
24 prediction::MacroMetricReporter*
add_macro_metrics(prediction::EvaluateMetrics& metrics,
int k) {
// Create (or fetch) the macro-at-k reporter owned by `metrics`.
25 auto* macro = metrics.add_macro_at_k(k);
// Track a per-label confusion matrix as part of the macro metrics.
27 macro->add_confusion_matrix();
// Body fragment of setup_metrics(prediction::EvaluateMetrics&, int top_k)
// (see the declaration listed at the end of this file). Registers the standard
// ranking metrics — precision@k, abandonment@k, DCG@k and normalized DCG@k —
// at the cutoffs k = 1, 3, 5.
// NOTE(review): the enclosing function header and intervening source lines are
// not visible in this listing; the boolean second argument of add_dcg_at_k
// presumably toggles normalization (nDCG) — confirm against the header.
// Metrics at cutoff k = 1.
60 metrics.add_precision_at_k(1);
61 metrics.add_abandonment_at_k(1);
62 metrics.add_dcg_at_k(1,
false);
63 metrics.add_dcg_at_k(1,
true);
// Metrics at cutoff k = 3.
68 metrics.add_precision_at_k(3);
69 metrics.add_abandonment_at_k(3);
70 metrics.add_dcg_at_k(3,
false);
71 metrics.add_dcg_at_k(3,
true);
// Metrics at cutoff k = 5.
75 metrics.add_precision_at_k(5);
76 metrics.add_abandonment_at_k(5);
77 metrics.add_dcg_at_k(5,
false);
78 metrics.add_dcg_at_k(5,
true);
// Entry point of the DiSMEC prediction tool: parses command-line options,
// loads the test set and model weights, computes (top-k or full) predictions
// in parallel, saves them, and optionally evaluates/reports metrics.
// NOTE(review): this listing is heavily fragmented — many original source
// lines (declarations of `threads`, `top_k`, `Verbose`, `DataProc`, `loader`,
// `runner`, `wf_it`, `data`, `tp/fp/tn/fn`, `model`, loop headers, the `try`
// opener and most closing braces) are missing. Comments below annotate only
// the visible statements.
84 int main(
int argc,
const char** argv) {
// CLI11 application object; the string is the program description.
85 CLI::App app{
"DiSMEC"};
// Paths supplied on the command line.
87 std::string problem_file;
88 std::string model_file;
89 std::string result_file;
90 std::string labels_file;
// If non-empty, metric values are written to this file.
91 std::filesystem::path save_metrics;
// Default output format is plain text, not numpy.
94 bool save_as_npy =
false;
// Positional, required arguments (model must already exist on disk).
99 app.add_option(
"model-file", model_file,
"The file from which the model will be read.")->required()->check(CLI::ExistingFile);;
100 app.add_option(
"result-file", result_file,
"The file to which the predictions will be written.")->required();
// Optional tuning flags. `threads` and `top_k` are declared on lines not
// visible in this fragment.
101 app.add_option(
"--threads", threads,
"Number of threads to use. -1 means auto-detect");
102 app.add_option(
"--save-metrics", save_metrics,
"Target file in which the metric values are saved");
103 app.add_option(
"--topk, --top-k", top_k,
"Only the top k predictions will be saved. "
104 "Set to -1 if you need all predictions. (Warning: This may result in very large files!)");
105 app.add_flag(
"--save-as-npy", save_as_npy,
"Save the predictions as a numpy file instead of plain text.");
// Verbosity flag; `Verbose` is declared outside this fragment.
107 app.add_flag(
"-v", Verbose);
// Parse argv; CLI11 throws CLI::ParseError on bad input (the enclosing
// `try` opener is on a line not shown here).
110 app.parse(argc, argv);
111 }
catch (
const CLI::ParseError &e) {
// Load the test dataset (DataProc is a data-loading helper configured on
// lines not shown; see setup_data_args in the declarations below).
115 auto test_set = DataProc.
load(Verbose);
// Abort path: the model directory contained no weight files.
131 spdlog::error(
"No weight files");
// --- Top-k prediction branch ---
135 spdlog::info(
"Calculating top-{} predictions", top_k);
// Ground truth: for each example, the list of its true label ids.
138 std::vector<std::vector<label_id_t>> examples_to_labels(test_set->num_examples());
// Invert the label->instances mapping into example->labels (the outer loop
// over `label` is on a line not shown here).
140 for(
auto example : test_set->get_label_instances(label)) {
141 examples_to_labels[example].push_back(label);
// Load the first partial model (weight-file iterator `wf_it` declared
// outside this fragment).
145 auto initial_model = loader.
load_model(wf_it);
146 spdlog::info(
"Using {} representation for model weights", initial_model->has_sparse_weights() ?
"sparse" :
"dense");
// Task generator that keeps the best top_k scores per example.
148 prediction::TopKPredictionTaskGenerator task(test_set.get(), initial_model, top_k);
// Overlap I/O with compute: asynchronously preload the next weight file
// while the current chunk of predictions is being computed. Captures
// `loader` by reference — safe only because the future is awaited below,
// inside loader's lifetime.
151 auto preload_weights = std::async(std::launch::async, [iter=wf_it, &loader]() {
153 return loader.load_model(iter);
// No more weight files: signal the end with a null model pointer.
155 return std::shared_ptr<dismec::model::Model>{};
// Run the prediction task on the (externally configured) parallel runner.
158 auto result = runner.
run(task);
159 if(!result.IsFinished) {
160 spdlog::error(
"Something went wrong, prediction computation was not finished!");
163 spdlog::info(
"Finished prediction in {}s", result.Duration.count());
// Swap in the preloaded next model part (blocks until the async load is
// done).
167 task.update_model(preload_weights.get());
// Persist the top-k scores and label indices (the save call spans lines
// not fully visible here).
170 spdlog::info(
"Saving to '{}'", result_file);
172 task.get_top_k_values(),
173 task.get_top_k_indices());
// Evaluate ranking metrics against the ground truth built above.
175 prediction::EvaluateMetrics metrics{&examples_to_labels, &task.get_top_k_indices(), test_set->num_labels()};
178 spdlog::info(
"Calculating metrics");
180 auto result_info = runner.
run(metrics);
181 spdlog::info(
"Calculated metrics in {}ms", std::chrono::duration_cast<std::chrono::milliseconds>(result_info.Duration).count());
// Print all metrics sorted by name, aligned for readability.
184 std::vector<std::pair<std::string, double>> results =metrics.get_metrics();
185 std::sort(results.begin(), results.end());
187 for(
const auto& [name, value] : results ) {
188 std::cout << fmt::format(
"{:15} = {:.4}", name, value) <<
"\n";
// Optionally serialize the metric values to a file (`data` is presumably
// an nlohmann::json object filled on lines not shown — confirm upstream).
191 if(!save_metrics.empty()) {
193 for(
const auto& [name, value] : results ) {
196 std::ofstream file(save_metrics);
197 file << std::setw(4) << data;
// Aggregate confusion-matrix counts (tp/fp/tn/fn extracted from `cm` on
// lines not visible in this fragment).
200 const auto& cm = task.get_confusion_matrix();
205 std::int64_t total = tp + fp + tn + fn;
207 std::cout << fmt::format(
"Confusion matrix is: \n"
208 "TP: {:15L} FP: {:15L}\n"
209 "FN: {:15L} TN: {:15L}\n", tp, fp, fn, tn);
// Integer-based percentage with two decimal digits; avoids accumulating
// floating-point error on very large counts. Assumes denominator != 0.
214 auto percentage = [](std::int64_t enumerator, std::int64_t denominator) {
215 std::int64_t base_result = (std::int64_t{10'000} * enumerator) / denominator;
216 return double(base_result) / 100.0;
// Derived classification measures; F1 is computed via the equivalent form
// tp / (tp + (fp + fn)/2).
219 std::cout << fmt::format(
"Accuracy: {:.3}%\n", percentage(tp + tn, total));
220 std::cout << fmt::format(
"Precision: {:.3}%\n", percentage(tp, tp + fp));
221 std::cout << fmt::format(
"Recall: {:.3}%\n", percentage(tp, tp + fn));
222 std::cout << fmt::format(
"F1: {:.3}%\n", percentage(tp, tp + (fp + fn) / 2));
// --- Full (non-top-k) prediction branch ---
225 spdlog::info(
"Reading model file from '{}'", model_file);
228 spdlog::info(
"Calculating full predictions");
// Computes the full score matrix; `model` is loaded on lines not shown.
229 prediction::FullPredictionTaskGenerator task(test_set.get(), model);
230 auto result = runner.run(task);
231 if(!result.IsFinished) {
232 spdlog::error(
"Something went wrong, prediction computation was not finished!");
// Dense prediction matrix, saved as txt or npy depending on save_as_npy
// (the save call is on lines not visible here).
235 const auto& predictions = task.get_predictions();
std::shared_ptr< MultiLabelData > load(int verbose)
void setup_data_args(CLI::App &app)
This class allows loading only a subset of the weights of a large model.
std::shared_ptr< Model > load_model(label_id_t label_begin, label_id_t label_end) const
Loads part of the model.
long num_weight_files() const
Returns the number of available weight files.
bool validate() const
Validates that all weight files exist.
Strong typedef for an int to signify a label id.
constexpr T to_index() const
Explicitly convert to an integer.
void set_logger(std::shared_ptr< spdlog::logger > logger)
sets the logger object that is used for reporting. Set to nullptr for quiet mode.
RunResult run(TaskGenerator &tasks, long start=0)
void set_chunk_size(long chunk_size)
void add_coverage(double threshold, std::string name={})
static constexpr const int TRUE_POSITIVES
static constexpr const int TRUE_NEGATIVES
static constexpr const int FALSE_POSITIVES
static constexpr const int FALSE_NEGATIVES
std::shared_ptr< Model > load_model(path source)
void save_dense_predictions_as_txt(const path &target, const PredictionMatrix &values)
Saves predictions as a dense txt matrix.
void save_dense_predictions_as_npy(const path &target, const PredictionMatrix &values)
Saves predictions as a dense npy file.
void save_sparse_predictions(const path &target, const PredictionMatrix &values, const IndexMatrix &indices)
Saves sparse predictions as a text file.
Main namespace in which all types, classes, and functions are defined.
constexpr const int PREDICTION_RUN_CHUNK_SIZE
Default chunk size for predicting scores.
constexpr const int PREDICTION_METRICS_CHUNK_SIZE
Default chunk size for calculating metrics.
int main(int argc, const char **argv)
prediction::MacroMetricReporter * add_macro_metrics(prediction::EvaluateMetrics &metrics, int k)
void setup_metrics(prediction::EvaluateMetrics &metrics, int top_k)