9 #include "spdlog/spdlog.h"
10 #include "spdlog/stopwatch.h"
15 for(
int row = 0; row < features.rows(); ++row) {
16 for(
auto it = SparseFeatures::InnerIterator(features, row); it; ++it) {
17 it.valueRef() = (1 + std::log(it.valueRef())) * idf.coeff(it.col());
24 int main(
int argc,
const char** argv) {
25 std::string TrainSetFile;
26 std::string TestSetFile;
27 std::string OutputTrain;
28 std::string OutputTest;
29 bool OneBasedIndex =
false;
31 CLI::App app{
"tfidf"};
32 app.add_option(
"train-set", TrainSetFile,
33 "The training dataset will be loaded from here.")->required()->check(CLI::ExistingFile);
34 auto* test_set_opt = app.add_option(
"--test-set", TestSetFile,
35 "The test dataset will be loaded from here. "
36 "If given, it will use the idf as calculated on the training set")->check(CLI::ExistingFile);
37 app.add_option(
"out", OutputTrain,
38 "The file to which the result (for the train set) will be saved.")->required();
39 app.add_option(
"--test-out", OutputTest,
40 "The file to which the result for the test set will be saved.")->needs(test_set_opt);
42 app.add_flag(
"--one-based-index", OneBasedIndex,
43 "If this flag is given, then we assume that the input dataset in xmc format and"
44 " has one-based indexing, i.e. the first label and feature are at index 1 (as opposed to the usual 0)");
54 app.parse(argc, argv);
55 }
catch (
const CLI::ParseError &e) {
60 spdlog::info(
"Read dataset from {} with {} instances and {} features.", TrainSetFile, train_data.num_examples(), train_data.num_features());
61 auto& train_features = train_data.edit_features()->sparse();
63 spdlog::stopwatch timer;
67 DenseRealVector scale = DenseRealVector::NullaryExpr(ftr_count.size(), 1,
68 [&](Eigen::Index i){ return std::log(train_features.rows() / std::max(1l, ftr_count[i])); });
71 spdlog::info(
"Applied tfidf transform in {:.3}s.", timer);
75 spdlog::info(
"Saved dataset to {} in {:.3}s.", OutputTrain, timer);
77 if(!TestSetFile.empty()) {
78 spdlog::info(
"Processing test dataset");
80 auto& test_features = test_data.edit_features()->sparse();
83 spdlog::info(
"Applied tfidf transform to test data in {:.3}s.", timer);
87 spdlog::info(
"Saved test data to {} in {:.3}s.", OutputTest, timer);
MultiLabelData read_xmc_dataset(const std::filesystem::path &source, IndexMode mode=IndexMode::ZERO_BASED)
Reads a dataset given in the extreme multilabel classification format.
void save_xmc_dataset(std::ostream &target, const MultiLabelData &data)
Saves the given dataset in XMC format.
@ ONE_BASED
labels and feature indices are 1, 2, ..., num
@ ZERO_BASED
labels and feature indices are 0, 1, ..., num - 1
Main namespace in which all types, classes, and functions are defined.
void normalize_instances(DatasetBase &data)
types::DenseVector< real_t > DenseRealVector
Any dense, real-valued vector.
std::vector< long > count_features(const SparseFeatures &features)
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
int main(int argc, const char **argv)
void apply_tfidf(SparseFeatures &features, const DenseRealVector &idf)