DiSMEC++
cascade.cpp
// Copyright (c) 2021, Aalto University, developed by Erik Schultheis
// All rights reserved.
//
// SPDX-License-Identifier: MIT

#include "parallel/runner.h"
#include "io/model-io.h"
#include "io/xmc.h"
#include "io/slice.h"
#include "data/data.h"
#include "data/transform.h"
#include "training/training.h"
#include "training/weighting.h"
#include "training/postproc.h"
#include "training/initializer.h"
#include "training/statistics.h"
#include "CLI/CLI.hpp"
#include "spdlog/spdlog.h"
#include "io/numpy.h"
#include "io/common.h"
#include "spdlog/stopwatch.h"
#include <future>

using namespace dismec;

class TrainingProgram {
public:
    TrainingProgram();
    int run(int argc, const char** argv);
private:
    CLI::App app{"DiSMEC-Cascade"};

    // command line parameters
    // source data
    void setup_source_cmdline();
    std::string TfIdfFile;
    std::string DenseFile;
    std::string ShortlistFile;

    // target model
    void setup_save_cmdline();
    std::filesystem::path ModelFile;
    io::model::SaveOption SaveOptions;

    // run range
    void setup_label_range();
    int FirstLabel = 0;
    int NumLabels = -1;
    bool ContinueRun = false;

    void parse_label_range();
    label_id_t LabelsBegin{0};
    label_id_t LabelsEnd{-1};

    CLI::Option* FirstLabelOpt;
    CLI::Option* NumLabelsOpt;

    // hyper params
    void setup_hyper_params();
    HyperParameters hps;

    // statistics
    std::string StatsOutFile = "stats.json";
    std::string StatsLevelFile = {};

    // normalization
    bool NormalizeSparse = false;
    bool NormalizeDense = false;
    DatasetTransform TransformSparse = DatasetTransform::IDENTITY;

    // initialization
    std::filesystem::path DenseWeightsFile;
    std::filesystem::path DenseBiasesFile;
    bool InitSparseMSI = false;
    bool InitDenseMSI = false;

    // regularization
    real_t RegScaleSparse = 1.0;
    real_t RegScaleDense = 1.0;

    // bias
    bool AugmentDenseWithBias = false;
    bool AugmentSparseWithBias = false;

    // others
    long NumThreads = -1;
    long Timeout = -1;
    long BatchSize = -1;
    std::filesystem::path ExportProcessedData;

    int Verbose = 0;

    // config setup helpers
    CascadeTrainingConfig make_config(const std::shared_ptr<MultiLabelData>& data, std::shared_ptr<const GenericFeatureMatrix> dense);
};

int main(int argc, const char** argv) {
    //openblas_set_num_threads(1);
    TrainingProgram program;
    return program.run(argc, argv);
}

void TrainingProgram::setup_save_cmdline()
{
    SaveOptions.Format = io::model::WeightFormat::SPARSE_TXT;
    SaveOptions.Culling = 0.01;

    app.add_option("output,--model-file", ModelFile,
                   "The file to which the model will be written. Note that models are saved in multiple different files, so this "
                   "just specifies the base name of the metadata file.")->required();

    app.add_option("--weight-culling", SaveOptions.Culling,
                   "When saving in a sparse format, any weight lower than this will be omitted.")->check(CLI::NonNegativeNumber);

    app.add_option("--save-precision", SaveOptions.Precision,
                   "The number of digits to write for real numbers in text file format.")->check(CLI::NonNegativeNumber);
}

void TrainingProgram::setup_source_cmdline() {
    app.add_option("tfidf-file", TfIdfFile,
                   "The file from which the tfidf data will be loaded.")->required()->check(CLI::ExistingFile);
    app.add_option("dense-file", DenseFile,
                   "The file from which the dense data will be loaded.")->required()->check(CLI::ExistingFile);
    app.add_option("--shortlist", ShortlistFile,
                   "A file containing the shortlist of hard-negative instances for each label.")->check(CLI::ExistingFile);
}

void TrainingProgram::setup_label_range() {
    FirstLabelOpt = app.add_option("--first-label", FirstLabel,
                                   "If you want to train only a subset of labels, this is the id of the first label to be trained. "
                                   "The subset of labels trained is `[first_label, first_label + num-labels)`.")->check(CLI::NonNegativeNumber);
    NumLabelsOpt = app.add_option("--num-labels", NumLabels,
                                  "If you want to train only a subset of labels, this is the total number of labels to be trained.")->check(CLI::NonNegativeNumber);
    app.add_flag("--continue", ContinueRun,
                 "If this flag is given, the new weights will be appended to the model "
                 "file, instead of overwriting it. You can use the --first-label option to explicitly specify "
                 "at which label to start. If omitted, training starts at the first label for which no "
                 "weight vector is known.");
}

void TrainingProgram::setup_hyper_params()
{
    // this needs to be set in all cases, because we need to adapt it dynamically and thus cannot rely on
    // default values
    hps.set("epsilon", 0.01);

    auto add_hyper_param_option = [&](const char* option, const char* name, const char* desc) {
        return app.add_option_function<double>(
            option,
            [this, name](double value) { hps.set(name, value); },
            desc)->group("hyper-parameters");
    };
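
    // Each call below registers one command-line option whose parsed value is forwarded
    // into the hyper-parameter set `hps` under the given name; e.g. passing
    // `--epsilon 0.001` on the command line results in `hps.set("epsilon", 0.001)`.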
    add_hyper_param_option("--epsilon", "epsilon",
                           "Tolerance for the minimizer. Will be adjusted by the number of positive/negative instances.")
        ->check(CLI::NonNegativeNumber);

    add_hyper_param_option("--alpha-pcg", "alpha-pcg",
                           "Interpolation parameter for preconditioning of CG optimization.")->check(CLI::Range(0.0, 1.0));

    add_hyper_param_option("--line-search-step-size", "search.step-size",
                           "Step size for the line search.")->check(CLI::NonNegativeNumber);

    add_hyper_param_option("--line-search-alpha", "search.alpha",
                           "Shrink factor for updating the line search step.")->check(CLI::Range(0.0, 1.0));

    add_hyper_param_option("--line-search-eta", "search.eta",
                           "Acceptance criterion for the line search.")->check(CLI::Range(0.0, 1.0));

    add_hyper_param_option("--cg-epsilon", "cg.epsilon",
                           "Stopping criterion for the CG solver.")->check(CLI::PositiveNumber);

    app.add_option_function<long>(
        "--max-steps",
        [this](long value) { hps.set("max-steps", value); },
        "Maximum number of newton steps.")->check(CLI::PositiveNumber)->group("hyper-parameters");

    app.add_option_function<long>(
        "--line-search-max-steps",
        [this](long value) { hps.set("search.max-steps", value); },
        "Maximum number of line search steps.")->check(CLI::PositiveNumber)->group("hyper-parameters");
}

void TrainingProgram::parse_label_range()
{
    // continue with automatic first label selection
    if(ContinueRun)
    {
        io::PartialModelSaver saver(ModelFile, SaveOptions, true);
        if(FirstLabelOpt->count() == 0)
        {
            auto missing = saver.get_missing_weights();
            spdlog::info("Model is missing weight vectors {} to {}.", missing.first.to_index(), missing.second.to_index() - 1);
            LabelsBegin = missing.first;
            LabelsEnd = missing.second;
            if (NumLabelsOpt->count() > 0) {
                if (LabelsEnd - LabelsBegin >= NumLabels) {
                    LabelsEnd = LabelsBegin + NumLabels;
                } else {
                    spdlog::warn("Number of labels to train was specified as {}, but only {} labels will be trained",
                                 NumLabels, LabelsEnd - LabelsBegin);
                }
            }
            return;
        } else {
            // user has given us a label from which to start.
            LabelsBegin = label_id_t{FirstLabel};
            if (NumLabelsOpt->count() > 0) {
                LabelsEnd = LabelsBegin + NumLabels;
                // and a label count. Then we need to check if this is valid
                if(saver.any_weight_vector_for_interval(LabelsBegin, LabelsEnd)) {
                    spdlog::error("Specified continuation of training weight vectors for labels {}-{}, "
                                  "which overlaps with existing weight vectors.", LabelsBegin.to_index(), LabelsEnd.to_index() - 1);
                    exit(EXIT_FAILURE);
                }
                return;
            }
            LabelsEnd = label_id_t{saver.num_labels()};
            return;
        }
    }

    // OK, we are not continuing a run.

    if(FirstLabelOpt->count()) {
        LabelsBegin = label_id_t{FirstLabel};
    } else {
        LabelsBegin = label_id_t{0};
    }

    if (NumLabelsOpt->count() > 0) {
        LabelsEnd = LabelsBegin + NumLabels;
    } else {
        LabelsEnd = label_id_t{-1};
    }
}

TrainingProgram::TrainingProgram() {
    setup_source_cmdline();
    setup_save_cmdline();
    setup_label_range();
    setup_hyper_params();

    app.add_option("--threads", NumThreads, "Number of threads to use. -1 means auto-detect.");
    app.add_option("--batch-size", BatchSize, "If this is given, training is split into batches "
                   "and results are written to disk after each batch.");
    app.add_option("--timeout", Timeout, "No new training tasks will be started after this time. "
                   "This can be used e.g. on a cluster system to ensure that the training finishes properly "
                   "even if not all work could be done in the allotted time.")
        ->transform(CLI::AsNumberWithUnit(std::map<std::string, float>{{"ms", 1},
                                                                       {"s", 1'000}, {"sec", 1'000},
                                                                       {"m", 60'000}, {"min", 60'000},
                                                                       {"h", 60*60'000}},
                                          CLI::AsNumberWithUnit::UNIT_REQUIRED, "TIME"));
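    // The unit suffix is mandatory (UNIT_REQUIRED): e.g. `--timeout 30min` or `--timeout 2h`
    // are accepted, whereas a bare `--timeout 1800` is rejected. The mapping above converts
    // the given value to milliseconds.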

    app.add_option("--record-stats", StatsLevelFile,
                   "Record some statistics and save to file. The argument is a json file which describes which statistics are gathered.")
        ->check(CLI::ExistingFile);
    app.add_option("--stats-file", StatsOutFile, "Target file for recorded statistics.");
    app.add_option("--init-dense-weights", DenseWeightsFile, "File from which the initial weights for the dense part will be loaded.")->check(CLI::ExistingFile);
    app.add_option("--init-dense-biases", DenseBiasesFile, "File from which the initial biases for the dense part will be loaded.")->check(CLI::ExistingFile);
    app.add_flag("--init-sparse-msi", InitSparseMSI, "If this flag is given, then the sparse part will use mean-separating initialization.");
    app.add_flag("--init-dense-msi", InitDenseMSI, "If this flag is given, then the dense part will use mean-separating initialization.");

    app.add_option("--sparse-reg-scale", RegScaleSparse, "Scaling factor for the sparse-part regularizer.")->check(CLI::NonNegativeNumber);
    app.add_option("--dense-reg-scale", RegScaleDense, "Scaling factor for the dense-part regularizer.")->check(CLI::NonNegativeNumber);
    app.add_flag("--normalize-dense", NormalizeDense, "Normalize the dense part of the feature matrix.");
    app.add_flag("--normalize-sparse", NormalizeSparse, "Normalize the sparse part of the feature matrix.");
    app.add_option("--transform-sparse", TransformSparse, "Apply a transformation to the sparse features.")->default_str("identity")
        ->transform(CLI::Transformer(std::map<std::string, DatasetTransform>{
            {"identity",     DatasetTransform::IDENTITY},
            {"log-one-plus", DatasetTransform::LOG_ONE_PLUS},
            {"one-plus-log", DatasetTransform::ONE_PLUS_LOG},
            {"sqrt",         DatasetTransform::SQRT}
        }, CLI::ignore_case));
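    // Presumably these transforms map each feature value x to x (identity), log(1 + x),
    // 1 + log(x), and sqrt(x), respectively; the actual definitions live in data/transform.h.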

    app.add_flag("--augment-dense-bias", AugmentDenseWithBias, "Add an additional feature column to the dense matrix with values one.");
    app.add_flag("--augment-sparse-bias", AugmentSparseWithBias, "Add an additional feature column to the sparse matrix with values one.");

    app.add_option("--export-dataset", ExportProcessedData,
                   "Exports the preprocessed dataset to the given file.");
    // each -v increases verbosity by one, each -q decreases it by one
    app.add_flag("-v,-q{-1}", Verbose);
}
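
// Illustrative invocation (binary and file names are hypothetical): the positional
// arguments are the sparse tfidf dataset, the dense feature matrix, and the model base name:
//   ./cascade train.tfidf.txt train.dense.npy model --normalize-sparse --augment-sparse-bias --batch-size 8192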

MultiLabelData join_data(const std::shared_ptr<MultiLabelData>& data,
                         std::shared_ptr<const GenericFeatureMatrix> dense_data) {

    const SparseFeatures& sparse = data->get_features()->sparse();
    const DenseFeatures& dense = dense_data->dense();

    // build a single sparse matrix whose columns are the dense features followed by
    // the sparse features, i.e. sparse column j becomes column `dense.cols() + j`
    SparseFeatures new_sparse(data->num_examples(), data->num_features() + dense.cols());
    new_sparse.reserve(sparse.nonZeros() + dense.size());
    for (int k = 0; k < data->num_examples(); ++k) {
        new_sparse.startVec(k);
        for (DenseFeatures::InnerIterator it(dense, k); it; ++it) {
            new_sparse.insertBack(it.row(), it.col()) = it.value();
        }
        for (SparseFeatures::InnerIterator it(sparse, k); it; ++it) {
            new_sparse.insertBack(it.row(), it.col() + dense.cols()) = it.value();
        }
    }
    new_sparse.finalize();
    return {new_sparse, data->all_labels()};
}
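
// Note on the (data, 1.0, -2.0) arguments below: for mean-separating initialization (MSI),
// these are presumably the target scores assigned to the mean positive and the mean negative
// feature vector of each label.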

CascadeTrainingConfig TrainingProgram::make_config(const std::shared_ptr<MultiLabelData>& data,
                                                   std::shared_ptr<const GenericFeatureMatrix> dense) {
    CascadeTrainingConfig config;

    if(InitSparseMSI)
        config.SparseInit = init::create_feature_mean_initializer(data, 1.0, -2.0);

    if(!DenseWeightsFile.empty()) {
        if(InitDenseMSI) {
            spdlog::error("Cannot use MSI and pretrained weights at the same time!");
            exit(EXIT_FAILURE);
        }
        if(DenseBiasesFile.empty()) {
            config.DenseInit = init::create_numpy_initializer(DenseWeightsFile, {});
        } else {
            config.DenseInit = init::create_numpy_initializer(DenseWeightsFile, DenseBiasesFile);
        }
    } else if(InitDenseMSI) {
        auto dense_ds = std::make_shared<MultiLabelData>(dense->dense(), data->all_labels());
        config.DenseInit = init::create_feature_mean_initializer(dense_ds, 1.0, -2.0);
    }

    config.StatsGatherer = std::make_shared<TrainingStatsGatherer>(StatsLevelFile, StatsOutFile);

    return config;
}

int TrainingProgram::run(int argc, const char** argv)
{
    try {
        app.parse(argc, argv);
    } catch (const CLI::ParseError &e) {
        return app.exit(e);
    }

    // check validity of save location
    auto parent = std::filesystem::absolute(ModelFile).parent_path();
    if(!std::filesystem::exists(parent)) {
        spdlog::warn("Save directory '{}' does not exist. Trying to create it.", parent.c_str());
        std::filesystem::create_directories(parent);
        if(!std::filesystem::exists(parent)) {
            spdlog::error("Could not create directory -- exiting.");
            return EXIT_FAILURE;
        }
    }

    // TODO At this point, we know that the target directory exists, but not whether it is writeable.
    //  still, it's a start.

    auto start_time = std::chrono::steady_clock::now();
    auto timeout_time = start_time + std::chrono::milliseconds(Timeout);

    spdlog::info("Loading training data from file '{}'", TfIdfFile);
    auto data = std::make_shared<MultiLabelData>([&]() {
        return read_xmc_dataset(TfIdfFile, io::IndexMode::ZERO_BASED);
    }());

    if(TransformSparse != DatasetTransform::IDENTITY) {
        spdlog::info("Applying data transformation");
        transform_features(*data, TransformSparse);
    }

    if(NormalizeSparse) {
        spdlog::stopwatch timer;
        normalize_instances(*data);
        spdlog::info("Normalized sparse features in {:.3} seconds.", timer);
    }
    if(AugmentSparseWithBias) {
        spdlog::stopwatch timer;
        augment_features_with_bias(*data);
        spdlog::info("Added bias column to sparse features in {:.3} seconds.", timer);
    }
    //auto permute = sort_features_by_frequency(*data);

    auto dense_data = std::make_shared<GenericFeatureMatrix>(io::load_matrix_from_npy(DenseFile));
    if(NormalizeDense) {
        spdlog::stopwatch timer;
        normalize_instances(dense_data->dense());
        spdlog::info("Normalized dense features in {:.3} seconds.", timer);
    }

    if(AugmentDenseWithBias) {
        spdlog::stopwatch timer;
        augment_features_with_bias(dense_data->dense());
        spdlog::info("Added bias column to dense features in {:.3} seconds.", timer);
    }
    // dense_data->dense().setZero();

    if(!ExportProcessedData.empty()) {
        spdlog::stopwatch timer;
        auto exported = join_data(data, dense_data);
        io::save_xmc_dataset(ExportProcessedData, exported, 6);
        spdlog::info("Saved preprocessed data to {} in {:.3} seconds", ExportProcessedData.string(), timer);
        exit(0);
    }

    std::shared_ptr<const std::vector<std::vector<long>>> shortlist;
    if(!ShortlistFile.empty()) {
        auto stream = std::fstream(ShortlistFile, std::fstream::in);
        auto result = io::read_binary_matrix_as_lol(stream);
        if(result.NumCols != data->num_labels()) {
            spdlog::error("Mismatch between number of labels in shortlist {} and in dataset {}",
                          result.NumCols, data->num_labels());
            exit(1);
        }
        if(result.NumRows != data->num_examples()) {
            spdlog::error("Mismatch between number of examples in shortlist {} and in dataset {}",
                          result.NumRows, data->num_examples());
            exit(1);
        }

        shortlist = std::make_shared<std::vector<std::vector<long>>>(std::move(result.NonZeros));
    }
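    // Rows of the binary shortlist matrix correspond to examples and columns to labels
    // (hence the dimension checks above), so shortlist[i] presumably lists the labels for
    // which example i acts as a shortlisted (hard-negative) training instance.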

    parse_label_range();

    auto runner = parallel::ParallelRunner(NumThreads);
    if(Verbose > 0)
        runner.set_logger(spdlog::default_logger());

    auto config = make_config(data, dense_data);

    std::shared_ptr<postproc::PostProcessFactory> post_proc = postproc::create_culling(SaveOptions.Culling);
    SaveOptions.Culling = 1e-10;
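    // The user-specified culling threshold is applied by the post-processing step above;
    // the saver's own threshold is then lowered to a tiny value so that, presumably,
    // only genuine zeros are dropped when the weights are written out.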

    if(BatchSize <= 0) {
        BatchSize = data->num_labels();
    }

    if(Verbose >= 0) {
        spdlog::info("handled preprocessing in {} seconds",
                     std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - start_time).count());
    }

    // batched training
    spdlog::info("Start training");
    io::PartialModelSaver saver(ModelFile, SaveOptions, ContinueRun);
    label_id_t first_label = LabelsBegin;
    if(LabelsEnd == label_id_t{-1}) {
        LabelsEnd = label_id_t{data->num_labels()};
    }
    label_id_t next_label = std::min(LabelsEnd, first_label + BatchSize);
    std::future<io::model::WeightFileEntry> saving;

    config.PostProcessing = post_proc;
    config.DenseReg = RegScaleDense;
    config.SparseReg = RegScaleSparse;

    while(true) {
        spdlog::info("Starting batch {} - {}", first_label.to_index(), next_label.to_index());

        // update time limit to respect remaining time
        runner.set_time_limit(std::chrono::duration_cast<std::chrono::milliseconds>(timeout_time - std::chrono::steady_clock::now()));

        std::shared_ptr<TrainingSpec> train_spec = create_cascade_training(data, dense_data, shortlist, hps, config);
        if(Verbose >= 0) {
            train_spec->set_logger(spdlog::default_logger());
        }
        auto result = run_training(runner, train_spec, first_label, next_label);

        /* do async saving. This has some advantages and some drawbacks:
            + all the i/o latency will be interleaved with actual new computation and we don't waste much time
              in this essentially non-parallel code
            - we may overcommit the processor. If run_training uses all cores, then we will spawn an additional thread
              here
            - increased memory consumption. Instead of 1 model, we need to keep 2 in memory at the same time: The one
              that is currently worked on and the one that is still being saved.
        */
        // make sure we don't interleave saving, as we don't do any locking in `saver`. Also, throw any exception
        // that happened during the saving
        if(saving.valid()) {
            saving.get();
            // saving weights has finished, we can update the meta data
            saver.update_meta_file();
        }

        saving = saver.add_model(result.Model);

        first_label = next_label;
        if(first_label == LabelsEnd) {
            // wait for the last saving process to finish
            saving.get();
            saver.update_meta_file();
            break;
        }
        next_label = std::min(LabelsEnd, first_label + BatchSize);
        // special case -- if the remaining labels are less than half a batch, we add them to this
        // batch
        if(next_label + BatchSize/2 > LabelsEnd) {
            next_label = LabelsEnd;
        }
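        // e.g. with 1000 labels and --batch-size 300, the batches are [0, 300), [300, 600),
        // and [600, 1000): the trailing 100 labels are folded into the last regular batch
        // instead of forming a tiny batch of their own.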
    }

    spdlog::info("program finished after {} seconds",
                 std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - start_time).count());

    return EXIT_SUCCESS;
}