10 #include <spdlog/spdlog.h>
16 "The file from which the data will be loaded.")->required()->check(CLI::ExistingFile);
19 "If this flag is given, then we assume that the input dataset in xmc format"
20 " has one-based indexing, i.e. the first label and feature are at index 1 (as opposed to the usual 0)");
22 "If this flag is given, then all training examples will be augmented with an additional"
23 " feature of value 1 or the specified value.")->default_val(1.0);
25 "If this flag is given, then the feature vectors of all instances are normalized to one.");
26 app.add_option(
"--transform",
TransformData,
"Apply a transformation to the features of the dataset.")->default_str(
"identity")
27 ->transform(CLI::Transformer(std::map<std::string, DatasetTransform>{
34 app.add_option(
"--label-file",
LabelFile,
"For SLICE-type datasets, this specifies where the labels can be found")->check(CLI::ExistingFile);
37 auto* hash_option = app.add_flag(
"--hash-features",
"If this flag is given, then feature hashing is performed.");
38 auto* bucket_option = app.add_option(
"--hash-buckets",
HashBuckets,
"Number of buckets for each hash function when feature hashing is enabled.")
39 ->needs(hash_option)->check(CLI::PositiveNumber);
40 app.add_option(
"--hash-repeat",
HashRepeats,
"Number of hash functions to use for feature hashing.")
41 ->needs(hash_option)->default_val(32)->check(CLI::PositiveNumber);
42 app.add_option(
"--hash-seed",
HashSeed,
"Seed to use when feature hashing.")
43 ->needs(hash_option)->default_val(42);
44 hash_option->needs(bucket_option);
49 spdlog::info(
"Loading training data from file '{}'",
DataSetFile);
51 auto data = std::make_shared<MultiLabelData>([&]() {
53 return read_xmc_dataset(DataSetFile, OneBasedIndex ? io::IndexMode::ONE_BASED : io::IndexMode::ZERO_BASED);
55 return io::read_slice_dataset(DataSetFile, LabelFile);
60 if(!data->get_features()->is_sparse()) {
61 spdlog::error(
"Feature hashing is currently only implemented for sparse features.");
64 spdlog::info(
"Hashing features");
71 spdlog::info(
"Applying data transformation");
77 spdlog::info(
"Normalizing instances.");
83 spdlog::info(
"Appending bias features with value {}",
Bias);
88 if(data->get_features()->is_sparse()) {
89 double total = data->num_features() * data->num_examples();
90 auto nnz = data->get_features()->sparse().nonZeros();
91 spdlog::info(
"Processed feature matrix has {} rows and {} columns. Contains {} non-zeros ({:.3} %)", data->num_examples(),
92 data->num_features(), nnz, 100.0 * (nnz / total));
94 spdlog::info(
"Processed feature matrix has {} rows and {} columns", data->num_examples(),
95 data->num_features());
std::shared_ptr< MultiLabelData > load(int verbose)
std::string DataSetFile
The file from which the dataset should be read.
bool augment_for_bias() const
void setup_data_args(CLI::App &app)
DatasetTransform TransformData
CLI::Option * AugmentForBias
Main namespace in which all types, classes, and functions are defined.
void normalize_instances(DatasetBase &data)
void augment_features_with_bias(DatasetBase &data, real_t bias=1)
void transform_features(DatasetBase &data, DatasetTransform transform)
void hash_sparse_features(SparseFeatures &features, unsigned seed, int buckets, int repeats)