DiSMEC++
common.cpp
Go to the documentation of this file.
1 // Copyright (c) 2021, Aalto University, developed by Erik Schultheis
2 // All rights reserved.
3 //
4 // SPDX-License-Identifier: MIT
5 
6 #include "common.h"
7 
8 using namespace dismec;
9 
10 std::string io::detail::print_char(char c) {
11  std::string result;
12  if(std::isprint(c) != 0) {
13  result.push_back(c);
14  return result;
15  }
16  result.push_back('\\');
17  result.append(std::to_string((int)c));
18  return result;
19 }
20 
21 std::ostream& io::write_vector_as_text(std::ostream& stream, const Eigen::Ref<const DenseRealVector>& data)
22 {
23  if(data.size() == 0) {
24  return stream;
25  }
26 
27  // size is > 0, so -1 is safe
28  for(int i = 0; i < data.size() - 1; ++i) {
29  stream << data.coeff(i) << ' ';
30  }
31  // no trailing space
32  stream << data.coeff(data.size() - 1);
33 
34  return stream;
35 }
36 
37 std::istream& io::read_vector_from_text(std::istream& stream, Eigen::Ref<DenseRealVector> data) {
38  for (int j = 0; j < data.size(); ++j) {
39  stream >> data.coeffRef(j);
40  }
41 
42  if (stream.bad()) {
43  THROW_ERROR("Error while reading a {} element dense vector from text data", data.size());
44  }
45 
46  return stream;
47 }
48 
49 io::MatrixHeader io::parse_header(const std::string& content) {
50  std::stringstream parse_header{content};
51  long NumRows = -1;
52  long NumCols = -1;
53 
54  parse_header >> NumRows >> NumCols;
55  if (parse_header.fail()) {
56  THROW_ERROR("Error parsing header: '{}'", content);
57  }
58 
59  // check validity of numbers
60  if(NumRows <= 0) {
61  THROW_ERROR("Invalid number of rows {} specified in header '{}'", NumRows, content);
62  }
63  if(NumCols <= 0) {
64  THROW_ERROR("Invalid number of rows {} specified in header '{}'", NumCols, content);
65  }
66 
67  std::string rest;
68  parse_header >> rest;
69  if(!rest.empty()) {
70  THROW_ERROR("Found additional text '{}' in header '{}'", rest, content);
71  }
72 
73  return {NumRows, NumCols};
74 }
75 
77  // for now, labels are assumed to come from a text file
78  std::string line_buffer;
79  std::getline(source, line_buffer);
80  auto header = parse_header(line_buffer);
81 
82  std::vector<std::vector<long>> label_data;
83  label_data.resize(header.NumCols);
84 
85  long example = 0;
86  long num_rows = header.NumRows;
87  long num_cols = header.NumCols;
88 
89  while (std::getline(source, line_buffer)) {
90  if (line_buffer.empty())
91  continue;
92  if (line_buffer.front() == '#')
93  continue;
94 
95  if(example >= num_rows) {
96  THROW_ERROR("Encountered row {:5} but only expected {:5} rows.", example, num_rows);
97  }
98 
99  try {
100  io::parse_sparse_vector_from_text(line_buffer.c_str(), [&](long index, double value) {
101  long adjusted_index = index;
102  if (adjusted_index >= num_cols || adjusted_index < 0) {
103  THROW_ERROR("Encountered index {:5}. Number of columns "
104  "was specified as {}.", index, num_cols);
105  }
106  // filter out explicit zeros
107  if (value != 1) {
108  THROW_ERROR("Encountered value {} at index {}.", value, index);
109  }
110  label_data[adjusted_index].push_back(example);
111  });
112  } catch (std::runtime_error& e) {
113  THROW_ERROR("Error reading example {}: {}.", example + 1, e.what());
114  }
115  ++example;
116  }
117  return {num_rows, num_cols, std::move(label_data)};
118 }
building blocks for io procedures that are used by multiple io subsystems
#define THROW_ERROR(...)
Definition: common.h:23
std::string print_char(char c)
Definition: common.cpp:10
const char * to_string(WeightFormat format)
Definition: model-io.cpp:89
std::ostream & write_vector_as_text(std::ostream &stream, const Eigen::Ref< const DenseRealVector > &data)
Writes the given vector as space-separated human-readable numbers.
Definition: common.cpp:21
std::istream & read_vector_from_text(std::istream &stream, Eigen::Ref< DenseRealVector > data)
Reads the given vector as space-separated human-readable numbers.
Definition: common.cpp:37
MatrixHeader parse_header(const std::string &content)
Definition: common.cpp:49
LoLBinarySparse read_binary_matrix_as_lol(std::istream &source)
Definition: common.cpp:76
void parse_sparse_vector_from_text(const char *feature_part, F &&callback)
parses sparse features given in index:value text format.
Definition: common.h:52
Main namespace in which all types, classes, and functions are defined.
Definition: app.h:15
Binary Sparse Matrix in List-of-Lists format.
Definition: common.h:143
Collects the rows and columns parsed from a plain-text matrix file.
Definition: common.h:130