10 #include "spdlog/spdlog.h"
11 #include "spdlog/fmt/fmt.h"
12 #include "spdlog/stopwatch.h"
33 long NumExamples = -1;
34 long NumFeatures = -1;
39 THROW_ERROR(
"Error parsing dataset header: '{}'", content);
43 if(NumExamples <= 0) {
44 THROW_ERROR(
"Invalid number of examples {} in specified in header '{}'", NumExamples, content);
46 if(NumFeatures <= 0) {
47 THROW_ERROR(
"Invalid number of features {} in specified in header '{}'", NumFeatures, content);
50 THROW_ERROR(
"Invalid number of labels {} in specified in header '{}'", NumLabels, content);
56 THROW_ERROR(
"Found additional text '{}' in header '{}'", rest, content);
59 return {NumExamples, NumFeatures, NumLabels};
77 std::string line_buffer;
78 std::vector<long> features_per_example;
79 features_per_example.reserve(num_examples);
83 while (std::getline(source, line_buffer))
86 if (line_buffer.empty())
88 if(line_buffer.front() ==
'#')
93 long num_ftr = std::count(begin(line_buffer), end(line_buffer),
':');
94 features_per_example.push_back(num_ftr);
97 return features_per_example;
115 const char *last = line;
116 if (!std::isspace(*line)) {
120 const char *result =
nullptr;
125 THROW_ERROR(
"Error parsing label. Expected a number.");
126 }
else if(errno != 0) {
127 THROW_ERROR(
"Error parsing label. Errno={}: '{}'", errno, strerror(errno));
129 if (*result ==
',') {
131 }
else if (std::isspace(*result) != 0 || *result ==
'\0') {
137 THROW_ERROR(
"Error parsing label. Expected ',', got '{}', '{}'", errno, *result ? *result :
'0', line);
163 template<
long IndexOffset>
166 std::vector<std::vector<long>>& label_buffer)
168 std::string line_buffer;
169 auto num_labels =
ssize(label_buffer);
170 auto num_features = feature_buffer.cols();
171 auto num_examples = feature_buffer.rows();
174 while (std::getline(source, line_buffer)) {
175 if (line_buffer.empty())
177 if (line_buffer.front() ==
'#')
180 if(example >= num_examples) {
181 THROW_ERROR(
"Encountered example number index {:5} but buffers only expect {:5} examples.", example, num_examples);
185 auto label_end =
parse_labels(line_buffer.data(), [&](
long lbl) {
186 long adjusted_label = lbl - IndexOffset;
187 if (adjusted_label >= num_labels || adjusted_label < 0) {
188 THROW_ERROR(
"Encountered label {:5}, but number of labels "
189 "was specified as {}.", lbl, num_labels);
191 label_buffer[adjusted_label].push_back(example);
195 long adjusted_index = index - IndexOffset;
196 if (adjusted_index >= num_features || adjusted_index < 0) {
197 THROW_ERROR(
"Encountered feature index {:5} with value {}. Number of features "
198 "was specified as {}.", index, value, num_features);
202 if(std::isnan(value)) {
203 THROW_ERROR(
"Encountered feature index {:5} with value {}.", index, value);
205 feature_buffer.insert(example, adjusted_index) =
static_cast<real_t>(value);
208 }
catch (std::runtime_error& e) {
209 THROW_ERROR(
"Error reading example {}: {}.", example + 1, e.what());
217 std::fstream source(source_path, std::fstream::in);
218 if (!source.is_open()) {
219 throw std::runtime_error(fmt::format(
"Cannot open input file {}", source_path.c_str()));
227 std::string line_buffer;
228 spdlog::stopwatch timer;
230 std::getline(source, line_buffer);
233 spdlog::info(
"Loading dataset '{}' with {} examples, {} features and {} labels.",
234 name, header.NumExamples, header.NumFeatures, header.NumLabels);
237 if (
ssize(features_per_example) != header.NumExamples) {
238 THROW_EXCEPTION(std::runtime_error,
"Dataset '{}' declared {} examples, but {} where found!",
239 name, header.NumExamples, features_per_example.size());
248 x.reserve(features_per_example);
250 std::vector<std::vector<long>> label_data;
251 label_data.resize(header.NumLabels);
255 std::getline(source, line_buffer);
257 if(mode == IndexMode::ZERO_BASED) {
258 read_into_buffers<0>(source, x, label_data);
260 read_into_buffers<1>(source, x, label_data);
266 for (
auto& instance_list : label_data) {
267 instance_list.shrink_to_fit();
270 spdlog::info(
"Finished loading dataset '{}' in {:.3}s.", name, timer);
272 return {x.markAsRValue(), std::move(label_data)};
283 auto all_but_one =
ssize(labels) - 1;
284 for(
int i = 0; i < all_but_one; ++i) {
285 stream << labels[i] <<
',';
288 stream << labels.back();
300 std::vector<std::vector<int>> all_labels(data.
num_examples());
303 all_labels[instance].push_back(label.to_index());
308 throw std::runtime_error(fmt::format(
"XMC format requires sparse labels"));
310 const auto& feature_ptr = data.
get_features()->sparse();
312 for(
int example = 0; example < data.
num_examples(); ++example) {
316 for (SparseFeatures::InnerIterator it(feature_ptr, example); it; ++it) {
317 target <<
' ' << it.col() <<
':' << it.value();
324 std::fstream target(target_path, std::fstream::out);
325 if (!target.is_open()) {
326 throw std::runtime_error(fmt::format(
"Cannot open output file {}", target_path.c_str()));
329 target.setf(std::fstream::fmtflags::_S_fixed, std::fstream::floatfield);
342 SUBCASE(
"trailing space") {
345 SUBCASE(
"tab separated") {
346 input =
"12\t54 \t 43 ";
349 CHECK(valid.NumExamples == 12);
350 CHECK(valid.NumFeatures == 54);
351 CHECK(valid.NumLabels == 43);
377 auto do_test = [](
const std::string& source) {
378 std::stringstream sstr(source);
380 REQUIRE(count.size() == 3);
381 CHECK(count[0] == 2);
382 CHECK(count[1] == 1);
383 CHECK(count[2] == 4);
387 std::string source = R
"(12 5:5.3 6:34
389 1 3:4 5:1 10:43 5:3)";
394 std::string source = R
"(12 5:5.3 6:34
397 1 3:4 5:1 10:43 5:3)";
401 SUBCASE("empty line") {
402 std::string source = R
"(12 5:5.3 6:34
405 1 3:4 5:1 10:43 5:3)";
423 CHECK_THROWS(
parse_labels(
"5,1, 5:2.0", [&](
long v) {}));
427 CHECK_THROWS(
parse_labels(
"5.5,1 10:3.0", [&](
long v) {}));
429 CHECK_THROWS(
parse_labels(
"5;1 10:3.0", [&](
long v) {}));
444 auto run_test = [&](std::string source,
const std::vector<long>& expect){
449 CHECK(expect.at(pos) == v);
452 }
catch (std::runtime_error& err) {
453 FAIL(
"parsing failed");
455 CHECK(expect.size() == pos);
458 SUBCASE(
"simple valid line") {
459 run_test(
"1,3,4 12:4", {1, 3, 4});
461 SUBCASE(
"with space") {
462 run_test(
"1, 3,\t4 12:4", {1, 3, 4});
464 SUBCASE(
"leading +") {
465 run_test(
"+1, 3,\t4 12:4", {1, 3, 4});
467 SUBCASE(
"separated by space") {
468 run_test(
"1,3,4\t12:4", {1, 3, 4});
470 SUBCASE(
"empty labels space") {
471 run_test(
" 12:4", {});
473 SUBCASE(
"empty labels tab") {
474 run_test(
"\t12:4", {});
476 SUBCASE(
"missing features") {
477 run_test(
"5, 1", {5, 1});
489 auto x = std::make_shared<SparseFeatures>(2, 3);
491 std::vector<std::vector<long>> labels;
493 std::stringstream source;
495 SUBCASE(
"invalid feature") {
496 source.str(
"1 2:0.5 3:0.5");
497 SUBCASE(
"zero-base") {
498 CHECK_THROWS(read_into_buffers<0>(source, *x, labels));
500 SUBCASE(
"one-base") {
501 CHECK_NOTHROW(read_into_buffers<1>(source, *x, labels));
505 SUBCASE(
"negative feature") {
506 source.str(
"1 -1:0.5 1:0.5");
507 SUBCASE(
"zero-base") {
508 CHECK_THROWS(read_into_buffers<0>(source, *x, labels));
510 SUBCASE(
"one-base") {
511 CHECK_THROWS(read_into_buffers<1>(source, *x, labels));
515 SUBCASE(
"invalid label") {
516 source.str(
"2 2:0.5");
517 SUBCASE(
"zero-base") {
518 CHECK_THROWS(read_into_buffers<0>(source, *x, labels));
520 SUBCASE(
"one-base") {
521 CHECK_NOTHROW(read_into_buffers<1>(source, *x, labels));
525 SUBCASE(
"negative label") {
526 source.str(
"-1 2:0.5");
527 SUBCASE(
"zero-base") {
528 CHECK_THROWS(read_into_buffers<0>(source, *x, labels));
530 SUBCASE(
"one-base") {
531 CHECK_THROWS(read_into_buffers<1>(source, *x, labels));
535 SUBCASE(
"invalid example") {
536 source.str(
"0 0:0.5\n0 0:0.5\n0 0:0.5");
537 SUBCASE(
"zero-base") {
538 CHECK_THROWS(read_into_buffers<0>(source, *x, labels));
540 SUBCASE(
"one-base") {
541 CHECK_THROWS(read_into_buffers<1>(source, *x, labels));
545 SUBCASE(
"invalid zero label in one-based indexing") {
546 source.str(
"0 2:0.5 2:0.5");
547 SUBCASE(
"zero-base") {
548 CHECK_NOTHROW(read_into_buffers<0>(source, *x, labels));
550 SUBCASE(
"one-base") {
551 CHECK_THROWS(read_into_buffers<1>(source, *x, labels));
555 SUBCASE(
"invalid zero feature in one-based indexing") {
556 source.str(
"1 0:0.5 2:0.5");
557 SUBCASE(
"zero-base") {
558 CHECK_NOTHROW(read_into_buffers<0>(source, *x, labels));
560 SUBCASE(
"one-base") {
561 CHECK_THROWS(read_into_buffers<1>(source, *x, labels));
long num_examples() const noexcept
Get the total number of instances, i.e. the number of rows in the feature matrix.
std::shared_ptr< const GenericFeatureMatrix > get_features() const
Get a shared pointer to the (immutable) feature data.
long num_features() const noexcept
Get the total number of features, i.e. the number of columns in the feature matrix.
long num_labels() const noexcept override
const std::vector< long > & get_label_instances(label_id_t label) const
Strong typedef for an int to signify a label id.
constexpr T to_index() const
Explicitly convert to an integer.
Building blocks for I/O procedures that are used by multiple I/O subsystems.
XMCHeader parse_xmc_header(const std::string &content)
Parses the header (number of examples, features, labels) of an XMC dataset file.
std::ostream & write_label_list(std::ostream &stream, const std::vector< int > &labels)
std::vector< long > count_features_per_example(std::istream &source, std::size_t num_examples=100'000)
Extracts number of nonzero features for each instance.
const char * parse_labels(const char *line, F &&callback)
Parses the labels part of an XMC dataset line.
void read_into_buffers(std::istream &source, SparseFeatures &feature_buffer, std::vector< std::vector< long >> &label_buffer)
Iterates over the lines in source and puts the corresponding features and labels into the given buffers.
constexpr double precision(const ConfusionMatrixBase< T > &matrix)
MultiLabelData read_xmc_dataset(const std::filesystem::path &source, IndexMode mode=IndexMode::ZERO_BASED)
Reads a dataset given in the extreme multilabel classification format.
MatrixHeader parse_header(const std::string &content)
void save_xmc_dataset(std::ostream &target, const MultiLabelData &data)
Saves the given dataset in XMC format.
IndexMode
Enum to decide whether indices in an xmc file are starting from 0 or from 1.
long parse_long(const char *string, const char **out)
void parse_sparse_vector_from_text(const char *feature_part, F &&callback)
Parses sparse features given in index:value text format.
Main namespace in which all types, classes, and functions are defined.
constexpr auto ssize(const C &c) -> std::common_type_t< std::ptrdiff_t, std::make_signed_t< decltype(c.size())>>
signed size free function. Taken from https://en.cppreference.com/w/cpp/iterator/size
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
float real_t
The default type for floating point values.
#define THROW_EXCEPTION(exception_type,...)
TEST_CASE("parse valid header")