DiSMEC++
tfidf.cpp
Go to the documentation of this file.
1 //
2 // Created by erik on 28.1.2022.
3 //
4 
5 #include "io/xmc.h"
6 #include "data/data.h"
7 #include "data/transform.h"
8 #include "CLI/CLI.hpp"
9 #include "spdlog/spdlog.h"
10 #include "spdlog/stopwatch.h"
11 
12 using namespace dismec;
13 
14 void apply_tfidf(SparseFeatures& features, const DenseRealVector& idf) {
15  for(int row = 0; row < features.rows(); ++row) {
16  for(auto it = SparseFeatures::InnerIterator(features, row); it; ++it) {
17  it.valueRef() = (1 + std::log(it.valueRef())) * idf.coeff(it.col());
18  }
19  }
20 
21  normalize_instances(features);
22 }
23 
24 int main(int argc, const char** argv) {
25  std::string TrainSetFile;
26  std::string TestSetFile;
27  std::string OutputTrain;
28  std::string OutputTest;
29  bool OneBasedIndex = false;
30 // bool Reorder = false;
31  CLI::App app{"tfidf"};
32  app.add_option("train-set", TrainSetFile,
33  "The training dataset will be loaded from here.")->required()->check(CLI::ExistingFile);
34  auto* test_set_opt = app.add_option("--test-set", TestSetFile,
35  "The test dataset will be loaded from here. "
36  "If given, it will use the idf as calculated on the training set")->check(CLI::ExistingFile);
37  app.add_option("out", OutputTrain,
38  "The file to which the result (for the train set) will be saved.")->required();
39  app.add_option("--test-out", OutputTest,
40  "The file to which the result for the test set will be saved.")->needs(test_set_opt);
41 
42  app.add_flag("--one-based-index", OneBasedIndex,
43  "If this flag is given, then we assume that the input dataset in xmc format and"
44  " has one-based indexing, i.e. the first label and feature are at index 1 (as opposed to the usual 0)");
45 /*
46  * TODO
47  */
48 /*
49  app.add_flag("--reorder", Reorder,
50  "If given, the features will be reordered based on their frequency. For large feature matrices, this may result in better"
51  "performance for sparse matrix multiplications.");
52 */
53  try {
54  app.parse(argc, argv);
55  } catch (const CLI::ParseError &e) {
56  return app.exit(e);
57  }
58 
59  auto train_data = read_xmc_dataset(TrainSetFile, OneBasedIndex ? io::IndexMode::ONE_BASED : io::IndexMode::ZERO_BASED);
60  spdlog::info("Read dataset from {} with {} instances and {} features.", TrainSetFile, train_data.num_examples(), train_data.num_features());
61  auto& train_features = train_data.edit_features()->sparse();
62 
63  spdlog::stopwatch timer;
64  auto ftr_count = count_features(train_features);
65 
66  // then rescale by idf
67  DenseRealVector scale = DenseRealVector::NullaryExpr(ftr_count.size(), 1,
68  [&](Eigen::Index i){ return std::log(train_features.rows() / std::max(1l, ftr_count[i])); });
69 
70  apply_tfidf(train_features, scale);
71  spdlog::info("Applied tfidf transform in {:.3}s.", timer);
72 
73  timer.reset();
74  io::save_xmc_dataset(OutputTrain, train_data);
75  spdlog::info("Saved dataset to {} in {:.3}s.", OutputTrain, timer);
76 
77  if(!TestSetFile.empty()) {
78  spdlog::info("Processing test dataset");
79  auto test_data = read_xmc_dataset(TestSetFile, OneBasedIndex ? io::IndexMode::ONE_BASED : io::IndexMode::ZERO_BASED);
80  auto& test_features = test_data.edit_features()->sparse();
81  timer.reset();
82  apply_tfidf(test_features, scale);
83  spdlog::info("Applied tfidf transform to test data in {:.3}s.", timer);
84 
85  timer.reset();
86  io::save_xmc_dataset(OutputTest, test_data);
87  spdlog::info("Saved test data to {} in {:.3}s.", OutputTest, timer);
88  }
89 }
MultiLabelData read_xmc_dataset(const std::filesystem::path &source, IndexMode mode=IndexMode::ZERO_BASED)
Reads a dataset given in the extreme multilabel classification format.
Definition: xmc.cpp:216
void save_xmc_dataset(std::ostream &target, const MultiLabelData &data)
Saves the given dataset in XMC format.
Definition: xmc.cpp:294
@ ONE_BASED
labels and feature indices are 1, 2, ..., num
@ ZERO_BASED
labels and feature indices are 0, 1, ..., num - 1
Main namespace in which all types, classes, and functions are defined.
Definition: app.h:15
void normalize_instances(DatasetBase &data)
Definition: transform.cpp:88
types::DenseVector< real_t > DenseRealVector
Any dense, real values vector.
Definition: matrix_types.h:40
std::vector< long > count_features(const SparseFeatures &features)
Definition: transform.cpp:114
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
Definition: matrix_types.h:50
int main(int argc, const char **argv)
Definition: tfidf.cpp:24
void apply_tfidf(SparseFeatures &features, const DenseRealVector &idf)
Definition: tfidf.cpp:14