DiSMEC++
transform.cpp
Go to the documentation of this file.
1 // Copyright (c) 2021, Aalto University, developed by Erik Schultheis
2 // All rights reserved.
3 //
4 // SPDX-License-Identifier: MIT
5 
6 #include "transform.h"
7 #include "data/data.h"
8 #include <random>
9 
10 using namespace dismec;
11 
12 namespace {
13  struct VisitorBias {
14  void operator()(SparseFeatures& features) const {
15  features = augment_features_with_bias(features, Bias);
16  }
17  void operator()(DenseFeatures& features) const {
18  features = augment_features_with_bias(features, Bias);
19  }
20 
22  };
23 }
24 
26  visit(VisitorBias{bias}, *data.edit_features());
27 }
28 
30  SparseFeatures new_sparse{features.rows(), features.cols() + 1};
31  new_sparse.reserve(features.nonZeros() + features.rows());
32  for (int k=0; k < features.outerSize(); ++k) {
33  new_sparse.startVec(k);
34  for (SparseFeatures::InnerIterator it(features, k); it; ++it)
35  {
36  new_sparse.insertBack(it.row(), it.col()) = it.value();
37  }
38  new_sparse.insertBack(k, features.cols()) = bias;
39  }
40  new_sparse.finalize();
41  return new_sparse;
42 }
43 
45  DenseFeatures new_features{features.rows(), features.cols() + 1};
46  new_features.leftCols(features.cols()) = features;
47  new_features.col(features.cols()).setConstant(bias);
48  // TODO add a unit test for this!
49  return new_features;
50 }
51 
53  return visit([](auto&& matrix){ return get_mean_feature(matrix); }, features);
54 }
55 
56 
58  DenseRealVector result(features.cols());
59  result.setZero();
60 
61  auto start = features.outerIndexPtr()[0];
62  auto end = features.outerIndexPtr()[features.rows()];
63 
64  const auto* indices = features.innerIndexPtr();
65  const auto* values = features.valuePtr();
66  for(auto index = start; index < end; ++index) {
67  auto col = indices[index];
68  result[col] += values[index];
69  }
70 
71  result /= features.rows();
72  return result;
73 }
74 
76  DenseRealVector result(features.cols());
77  result.setZero();
78 
79  for(int i = 0; i < features.rows(); ++i) {
80  result += features.row(i);
81  }
82 
83  result /= features.rows();
84  return result;
85 }
86 
87 
89  visit([](auto&& f){ normalize_instances(f); }, *data.edit_features());
90 }
91 
93  for(int i = 0; i < features.rows(); ++i) {
94  real_t norm = features.row(i).norm();
95  if(norm > 0) {
96  features.row(i) /= norm;
97  }
98  }
99 }
100 
102  for(int i = 0; i < features.rows(); ++i) {
103  real_t norm = features.row(i).norm();
104  if(norm > 0) {
105  features.row(i) /= norm;
106  }
107  }
108 }
109 
110 Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> dismec::sort_features_by_frequency(DatasetBase& data) {
111  return visit([](auto&& f){ return sort_features_by_frequency(f); }, *data.edit_features());
112 }
113 
114 std::vector<long> dismec::count_features(const SparseFeatures& features) {
115  std::vector<long> counts(features.cols(), 0);
116  assert(features.isCompressed());
117 
118  // count the nonzero features
119  // the outer index is the row (instance index), the inner index is the feature id
120  const auto* last = features.innerIndexPtr() + features.nonZeros();
121  for(const auto* start = features.innerIndexPtr(); start != last; ++start) {
122  counts[*start] += 1;
123  }
124  return counts;
125 }
126 
127 Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> dismec::sort_features_by_frequency(SparseFeatures& features) {
128  if(!features.isCompressed()) {
129  features.makeCompressed();
130  }
131  std::vector<long> counts = count_features(features);
132 
133  // do an argsort
134  types::DenseVector<int> reorder = types::DenseVector<int>::LinSpaced(features.cols(), 0, features.cols());
135  std::sort(reorder.begin(), reorder.end(), [&](int a, int b){
136  return counts[a] < counts[b];
137  });
138 
139  //create permutation Matrix with the size of the columns
140  Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> permutation(reorder);
141 
142  features = features * permutation;
143  return permutation;
144 }
145 
146 Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> dismec::sort_features_by_frequency(DenseFeatures& features) {
147  Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> permutation(features.cols());
148  permutation.setIdentity();
149  return permutation;
150 }
151 
153  visit([&](auto&& f){ return transform_features(f, transform); }, *data.edit_features());
154 }
155 
156 namespace {
157  template<class T>
158  void transform_features_imp(T& features, DatasetTransform transform) {
159  switch(transform) {
161  break;
163  features = features.unaryExpr([](const real_t& value) { return std::log1p(value); });
164  break;
166  features = features.unaryExpr([](const real_t& value) { return real_t{1} + std::log(value); });
167  break;
169  features = features.unaryExpr([](const real_t& value) { return std::sqrt(value); });
170  break;
171  }
172  }
173 }
174 
176  transform_features_imp(features, transform);
177 }
178 
180  transform_features_imp(features, transform);
181 }
182 
183 void dismec::hash_sparse_features(SparseFeatures& features, unsigned seed, int buckets, int repeats) {
184  // First, count in how many instances each feature is used
185  if(!features.isCompressed()) {
186  features.makeCompressed();
187  }
188 
189  std::ranlux24 rng(seed);
190  std::uniform_int_distribution<int> mapping(0, buckets - 1);
191  Eigen::MatrixXi hash = Eigen::MatrixXi::NullaryExpr(features.cols(), repeats, [&](){
192  return mapping(rng);
193  });
194  DenseRealVector new_row(buckets * repeats);
195  SparseFeatures result(features.rows(), buckets * repeats);
196  for (int k=0; k < features.rows(); ++k) {
197  result.startVec(k);
198  new_row.setZero();
199  for (SparseFeatures::InnerIterator it(features, k); it; ++it)
200  {
201  for(int j = 0; j < repeats; ++j) {
202  new_row.coeffRef(hash.coeff(it.col(), j) + j * buckets) += it.value();
203  }
204  }
205 
206  // copy the dense vector into the row of the sparse matrix
207  for(int i = 0; i < new_row.size(); ++i) {
208  if(new_row.coeff(i) > 0) {
209  result.insertBack(k, i) = new_row.coeff(i);
210  }
211  }
212  }
213  result.finalize();
214 
215  // overwrite features
216  features = std::move(result);
217 }
218 
219 SparseFeatures dismec::shortlist_features(const SparseFeatures& source, const std::vector<long>& shortlist) {
220  SparseFeatures new_features(shortlist.size(), source.cols());
221  new_features.reserve(2 * source.nonZeros() * double(shortlist.size()) / double(source.rows()));
222  long new_row = 0;
223  for (auto row : shortlist) {
224  new_features.startVec(new_row);
225  for (SparseFeatures::InnerIterator it(source, row); it; ++it)
226  {
227  new_features.insertBack(new_row, it.col()) = it.value();
228  }
229  ++new_row;
230  }
231  new_features.finalize();
232  return new_features;
233 }
234 
235 DenseFeatures dismec::shortlist_features(const DenseFeatures& source, const std::vector<long>& shortlist) {
236  DenseFeatures new_features(shortlist.size(), source.cols());
237  long new_row = 0;
238  for (auto row : shortlist) {
239  new_features.row(new_row) = source.row(row);
240  ++new_row;
241  }
242  return new_features;
243 }
244 
245 
246 #include "doctest.h"
247 
248 TEST_CASE("augment sparse") {
249  SparseFeatures test(5, 5);
250  test.insert(3, 2) = 2.0;
251  test.insert(1, 3) = -1.0;
252  test.insert(0, 4) = 5.0;
253  test.insert(2, 2) = 2.0;
254  test.insert(2, 3) = 4.0;
255 
256  SparseFeatures extended = augment_features_with_bias(test, 1.0);
257 
258  // these checks are easier done using a dense matrix
259  DenseFeatures dense_test = test;
260  DenseFeatures dense_ext = extended;
261 
262  CHECK(dense_test.leftCols(Eigen::fix<5>) == dense_ext.leftCols(Eigen::fix<5>));
263  CHECK(dense_ext.col(Eigen::fix<5>) == DenseFeatures::Ones(5, 1));
264 }
265 
266 TEST_CASE("sort features") {
267  SparseFeatures test(5, 4);
268  test.insert(3, 2) = 2.0;
269  test.insert(1, 3) = -1.0;
270  test.insert(2, 2) = 2.0;
271  test.insert(1, 2) = 2.0;
272  test.insert(2, 3) = 4.0;
273  test.insert(2, 0) = -4.0;
274 
275  // column freqs: 0 - 1, 1: 0, 2: 3, 3: 2
276  // => reorder: 1, 0, 3, 2
277 
278  SparseFeatures expected(5, 4);
279  expected.insert(3, 3) = 2.0;
280  expected.insert(1, 2) = -1.0;
281  expected.insert(2, 3) = 2.0;
282  expected.insert(1, 3) = 2.0;
283  expected.insert(2, 2) = 4.0;
284  expected.insert(2, 1) = -4.0;
285 
287 
288  CHECK(test.toDense() == expected.toDense());
289 }
std::shared_ptr< GenericFeatureMatrix > edit_features()
get a shared pointer to mutable feature data. Use with care.
Definition: data.cpp:43
void transform_features_imp(T &features, DatasetTransform transform)
Definition: transform.cpp:158
outer_const< T, dense_vector_h > DenseVector
Definition: type_helpers.h:37
auto visit(F &&f, Variants &&... variants)
Definition: eigen_generic.h:95
Main namespace in which all types, classes, and functions are defined.
Definition: app.h:15
types::DenseRowMajor< real_t > DenseFeatures
Dense Feature Matrix in Row Major format.
Definition: matrix_types.h:58
void normalize_instances(DatasetBase &data)
Definition: transform.cpp:88
void augment_features_with_bias(DatasetBase &data, real_t bias=1)
Definition: transform.cpp:25
DatasetTransform
Definition: transform.h:37
void transform_features(DatasetBase &data, DatasetTransform transform)
Definition: transform.cpp:152
types::DenseVector< real_t > DenseRealVector
Any dense, real values vector.
Definition: matrix_types.h:40
std::vector< long > count_features(const SparseFeatures &features)
Definition: transform.cpp:114
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
Definition: matrix_types.h:50
void hash_sparse_features(SparseFeatures &features, unsigned seed, int buckets, int repeats)
Definition: transform.cpp:183
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > sort_features_by_frequency(DatasetBase &data)
Definition: transform.cpp:110
SparseFeatures shortlist_features(const SparseFeatures &source, const std::vector< long > &shortlist)
Definition: transform.cpp:219
DenseRealVector get_mean_feature(const GenericFeatureMatrix &features)
Definition: transform.cpp:52
float real_t
The default type for floating point values.
Definition: config.h:17
void operator()(DenseFeatures &features) const
Definition: transform.cpp:17
void operator()(SparseFeatures &features) const
Definition: transform.cpp:14
TEST_CASE("augment sparse")
Definition: transform.cpp:248