DiSMEC++
generic_linear.cpp
// Copyright (c) 2021, Aalto University, developed by Erik Schultheis
// All rights reserved.
//
// SPDX-License-Identifier: MIT

#include "generic_linear.h"
#include "utils/eigen_generic.h"
#include "utils/throw_error.h"
#include "stats/collection.h"
#include "margin_losses.h"

using namespace dismec;

namespace {
    constexpr const stat_id_t STAT_GRAD_SPARSITY{8};
}

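// The loss part of the objective has the generic form
//     f(w) = sum_i c_i * loss_i( (X w)_i ) + R(w),
// where c_i are the per-instance costs and R is the regularizer. The cached
// per-instance derivatives d_i = c_i * loss_i'((X w)_i) and
// h_i = c_i * loss_i''((X w)_i) (see cached_derivative() and
// cached_2nd_derivative() below) then yield the gradient X^T d and the
// Hessian-vector product X^T diag(h) X v, to which the regularizer's
// contributions are added.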
real_t GenericLinearClassifier::value_unchecked(const HashVector& location) {
    const DenseRealVector& xTw = x_times_w(location);
    return value_from_xTw(xTw) + m_Regularizer->value(location);
}

real_t GenericLinearClassifier::lookup_on_line(real_t position) {
    m_GenericInBuffer = line_interpolation(position);
    real_t f = value_from_xTw(m_GenericInBuffer);
    return f + m_Regularizer->lookup_on_line(position);
}

real_t GenericLinearClassifier::value_from_xTw(const DenseRealVector& xTw)
{
    calculate_loss(xTw, labels(), m_GenericOutBuffer);
    return m_GenericOutBuffer.dot(costs());
}

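// Hessian-times-direction: on top of the regularizer's contribution, accumulate
// sum_i h_i * (x_i . direction) * x_i, skipping all instances whose cached
// second derivative h_i is zero.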
void GenericLinearClassifier::hessian_times_direction_unchecked(const HashVector& location,
                                                                const DenseRealVector& direction,
                                                                Eigen::Ref<DenseRealVector> target) {
    m_Regularizer->hessian_times_direction(location, direction, target);

    const auto& hessian = cached_2nd_derivative(location);
    visit([&](const auto& features) {
        for (int pos = 0; pos < hessian.size(); ++pos) {
            if(real_t h = hessian.coeff(pos); h != 0) {
                real_t factor = features.row(pos).dot(direction);
                target += features.row(pos) * factor * h;
            }
        }
    }, generic_features());
}

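// Combines the calculation of gradient and pre-conditioner in a single sweep
// over the feature matrix; this saves one full traversal compared to calling
// gradient() and diag_preconditioner() separately.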
void GenericLinearClassifier::gradient_and_pre_conditioner_unchecked(const HashVector& location,
                                                                     Eigen::Ref<DenseRealVector> gradient,
                                                                     Eigen::Ref<DenseRealVector> pre) {
    m_Regularizer->gradient(location, gradient);
    m_Regularizer->diag_preconditioner(location, pre);

    const auto& derivative = cached_derivative(location);
    const auto& hessian = cached_2nd_derivative(location);
    visit([&](const auto& features) {
        for (int pos = 0; pos < derivative.size(); ++pos) {
            if(real_t d = derivative.coeff(pos); d != 0) {
                gradient += features.row(pos) * d;
            }
            if(real_t h = hessian.coeff(pos); h != 0) {
                pre += features.row(pos).cwiseAbs2() * h;
            }
        }
    }, generic_features());
}

void GenericLinearClassifier::gradient_unchecked(const HashVector& location, Eigen::Ref<DenseRealVector> target) {
    m_Regularizer->gradient(location, target);

    const auto& derivative = cached_derivative(location);
    visit([&](const auto& features) {
        for (int pos = 0; pos < derivative.size(); ++pos) {
            if(real_t d = derivative.coeff(pos); d != 0) {
                target += features.row(pos) * d;
            }
        }
    }, generic_features());
}

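// Gradient at w = 0: here x^T w is the zero vector, so the point-wise loss
// derivative is evaluated directly on a zero buffer instead of going through
// the location-keyed caches; the costs are applied explicitly per instance.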
void GenericLinearClassifier::gradient_at_zero_unchecked(Eigen::Ref<DenseRealVector> target) {
    m_Regularizer->gradient_at_zero(target);

    m_GenericInBuffer = DenseRealVector::Zero(labels().size());
    calculate_derivative(m_GenericInBuffer, labels(), m_GenericOutBuffer);
    const auto& cost_vector = costs();
    visit([&](const auto& features) {
        for (int pos = 0; pos < m_GenericOutBuffer.size(); ++pos) {
            if(real_t d = m_GenericOutBuffer.coeff(pos); d != 0) {
                target += features.row(pos) * (cost_vector.coeff(pos) * d);
            }
        }
    }, generic_features());
}

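// Diagonal pre-conditioner for the CG solver: the diagonal of the loss Hessian
// is sum_i h_i * x_i^2 (element-wise square), added on top of the
// regularizer's pre-conditioner.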
void GenericLinearClassifier::diag_preconditioner_unchecked(const HashVector& location, Eigen::Ref<DenseRealVector> target) {
    m_Regularizer->diag_preconditioner(location, target);

    const auto& hessian = cached_2nd_derivative(location);
    visit([&](const auto& features) {
        for (int pos = 0; pos < hessian.size(); ++pos) {
            if(real_t h = hessian.coeff(pos); h != 0) {
                target += features.row(pos).cwiseAbs2() * h;
            }
        }
    }, generic_features());
}

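// The derivative buffers are cached based on the version tag of the HashVector,
// so repeated queries at the same location (e.g. a gradient followed by several
// Hessian-vector products within one CG run) compute the point-wise derivatives
// only once. Note that the cached values already include the cost factors. The
// STAT_GRAD_SPARSITY statistic records the percentage of nonzero entries in the
// loss derivative.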
const DenseRealVector& GenericLinearClassifier::cached_derivative(const HashVector& location) {
    return m_DerivativeBuffer.update(location, [&](const DenseRealVector& input, DenseRealVector& out){
        calculate_derivative(x_times_w(location), labels(), out);
        record(STAT_GRAD_SPARSITY, [&](){
            long nnz = 0;
            for(int i = 0; i < out.size(); ++i) {
                if(out.coeff(i) != 0) ++nnz;
            }
            return static_cast<real_t>(static_cast<double>(100 * nnz) / out.size());
        });
        out.array() *= costs().array();
    });
}

const DenseRealVector& GenericLinearClassifier::cached_2nd_derivative(const HashVector& location) {
    return m_SecondDerivativeBuffer.update(location, [&](const DenseRealVector& input, DenseRealVector& out){
        calculate_2nd_derivative(x_times_w(location), labels(), out);
        out.array() *= costs().array();
    });
}

void GenericLinearClassifier::invalidate_labels() {
    m_DerivativeBuffer.invalidate();
    m_SecondDerivativeBuffer.invalidate();
}

GenericLinearClassifier::GenericLinearClassifier(std::shared_ptr<const GenericFeatureMatrix> X,
                                                 std::unique_ptr<Objective> regularizer)
    : LinearClassifierBase(std::move(X)),
      m_SecondDerivativeBuffer(num_instances()),
      m_DerivativeBuffer(num_instances()), m_GenericInBuffer(num_instances()),
      m_GenericOutBuffer(num_instances()), m_Regularizer(std::move(regularizer))
{
    declare_stat(STAT_GRAD_SPARSITY, {"gradient_sparsity", "% non-zeros"});
    if(!m_Regularizer) {
        THROW_EXCEPTION(std::invalid_argument, "Regularizer cannot be nullptr");
    }
}

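// Projects the objective onto a search line: the linear part prepares the
// interpolation caches used by lookup_on_line(), and the regularizer performs
// its own line projection.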
void GenericLinearClassifier::project_to_line_unchecked(const HashVector& location, const DenseRealVector& direction) {
    project_linear_to_line(location, direction);
    m_Regularizer->project_to_line(location, direction);
}

// ---------------------------------------------------------------------------------------------------------------------
// Some concrete implementations of common loss functions
// ---------------------------------------------------------------------------------------------------------------------

namespace objective = dismec::objective;

namespace {
    template<class Phi, class... Args>
    std::unique_ptr<GenericLinearClassifier> make_gen_lin_classifier(std::shared_ptr<const GenericFeatureMatrix> X,
                                                                     std::unique_ptr<objective::Objective> regularizer,
                                                                     Args... args) {
        return std::make_unique<objective::GenericMarginClassifier<Phi>>(std::move(X), std::move(regularizer),
                                                                         Phi{std::forward<Args>(args)...});
    }
}

std::unique_ptr<GenericLinearClassifier> objective::make_squared_hinge(std::shared_ptr<const GenericFeatureMatrix> X,
                                                                       std::unique_ptr<Objective> regularizer) {
    return make_gen_lin_classifier<SquaredHingePhi>(std::move(X), std::move(regularizer));
}

std::unique_ptr<GenericLinearClassifier> objective::make_logistic_loss(std::shared_ptr<const GenericFeatureMatrix> X,
                                                                       std::unique_ptr<Objective> regularizer) {
    return make_gen_lin_classifier<LogisticPhi>(std::move(X), std::move(regularizer));
}

std::unique_ptr<GenericLinearClassifier> objective::make_huber_hinge(std::shared_ptr<const GenericFeatureMatrix> X,
                                                                     std::unique_ptr<Objective> regularizer,
                                                                     real_t epsilon) {
    return make_gen_lin_classifier<HuberPhi>(std::move(X), std::move(regularizer), epsilon);
}

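// Typical usage of these factory functions (illustrative sketch, mirroring the
// test cases below; assumes a feature matrix `X` and a binary label vector `y`
// of matching dimensions are already available):
//
//     auto clf = objective::make_squared_hinge(
//             std::make_shared<GenericFeatureMatrix>(X),
//             std::make_unique<objective::SquaredNormRegularizer>());
//     clf->get_label_ref() = y;
//     clf->update_costs(pos_cost, neg_cost);
//
// make_logistic_loss follows the same pattern; make_huber_hinge additionally
// takes the `epsilon` parameter of the Huber hinge loss.
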
#ifndef DOCTEST_CONFIG_DISABLE
#include "doctest.h"
#include "regularizers_imp.h"
#include "reg_sq_hinge.h"

using namespace dismec;

namespace {
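    // Checks that two objectives agree (up to floating-point tolerance) on every
    // entry point of the Objective interface: value, gradient at zero, gradient,
    // pre-conditioner, Hessian-times-direction, and the fused
    // gradient-and-pre-conditioner computation.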
    void test_equivalence(objective::Objective& a, objective::Objective& b, const HashVector& input) {
        auto test_vector_equal = [](auto&& u, auto&& v, const char* message){
            REQUIRE(u.size() == v.size());
            for(int i = 0; i < u.size(); ++i) {
                REQUIRE_MESSAGE(u.coeff(i) == doctest::Approx(v.coeff(i)), message);
            }
        };
        DenseRealVector buffer_a(input->size());
        DenseRealVector buffer_b(input->size());
        CHECK_MESSAGE(a.value(input) == doctest::Approx(b.value(input)), "values differ");

        a.gradient_at_zero(buffer_a);
        b.gradient_at_zero(buffer_b);
        test_vector_equal(buffer_a, buffer_b, "gradient@0 mismatch");

        a.gradient(input, buffer_a);
        b.gradient(input, buffer_b);
        test_vector_equal(buffer_a, buffer_b, "gradient mismatch");

        a.diag_preconditioner(input, buffer_a);
        b.diag_preconditioner(input, buffer_b);
        test_vector_equal(buffer_a, buffer_b, "pre-conditioner mismatch");

        DenseRealVector direction = DenseRealVector::Random(input->size());
        a.hessian_times_direction(input, direction, buffer_a);
        b.hessian_times_direction(input, direction, buffer_b);
        test_vector_equal(buffer_a, buffer_b, "hessian mismatch");

        DenseRealVector buffer_a2(input->size());
        DenseRealVector buffer_b2(input->size());
        a.gradient_and_pre_conditioner(input, buffer_a, buffer_a2);
        b.gradient_and_pre_conditioner(input, buffer_b, buffer_b2);
        test_vector_equal(buffer_a, buffer_b, "gradient mismatch");
        test_vector_equal(buffer_a2, buffer_b2, "pre-conditioner mismatch");
    }
}

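// Verifies that the generic classifier gives identical results for dense and
// sparse representations of the same feature matrix, and that both match the
// hand-optimized Regularized_SquaredHingeSVC reference implementation.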
TEST_CASE("sparse/dense equivalence") {
    int rows, cols;
    real_t pos_cost = 1, neg_cost = 1;

    auto run_test = [&](){
        DenseFeatures features_dense = DenseFeatures::Random(rows, cols);
        SparseFeatures features_sparse = features_dense.sparseView();

        Eigen::Matrix<std::int8_t, Eigen::Dynamic, 1> labels = Eigen::Matrix<std::int8_t, Eigen::Dynamic, 1>::Random(rows);
        for(int i = 0; i < labels.size(); ++i) {
            if(labels.coeff(i) > 0) {
                labels.coeffRef(i) = 1;
            } else {
                labels.coeffRef(i) = -1;
            }
        }

        auto reg_dense = make_squared_hinge(std::make_shared<GenericFeatureMatrix>(features_dense),
                                            std::make_unique<objective::SquaredNormRegularizer>());
        auto reg_sparse = make_squared_hinge(std::make_shared<GenericFeatureMatrix>(features_sparse),
                                             std::make_unique<objective::SquaredNormRegularizer>());

        auto reference = objective::Regularized_SquaredHingeSVC(std::make_shared<GenericFeatureMatrix>(features_sparse),
                                                                std::make_unique<objective::SquaredNormRegularizer>());

        auto do_test = [&](auto& first, auto& second) {
            first.get_label_ref() = labels;
            second.get_label_ref() = labels;

            first.update_costs(pos_cost, neg_cost);
            second.update_costs(pos_cost, neg_cost);

            DenseRealVector weights = DenseRealVector::Random(cols);
            test_equivalence(first, second, HashVector(weights));
        };

        do_test(*reg_dense, *reg_sparse);
        do_test(reference, *reg_sparse);
    };

    SUBCASE("rows > cols") {
        rows = 20;
        cols = 10;
        run_test();
    }
    SUBCASE("cols > rows") {
        rows = 10;
        cols = 20;
        run_test();
    }

    SUBCASE("pos weighted") {
        rows = 15;
        cols = 32;
        pos_cost = 2.0;
        run_test();
    }

    SUBCASE("neg weighted") {
        rows = 15;
        cols = 32;
        neg_cost = 2.0;
        run_test();
    }
    /*
    SUBCASE("large") {
        rows = 1500;
        cols = 3200;
        // this fails, which indicates different numerical stability
        run_test();
    }
    */
}

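// Hand-computed check for the squared hinge: with the weights below we get
// z = Xw = (-1, 2, 2) and margins 1 - y.*z = (0, -1, 3), so only the third
// (negative) instance contributes max(0, 1 - yz)^2 = 9 to the loss, and the
// SquaredNormRegularizer adds 0.5*||w||^2 = 5.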
TEST_CASE("generic squared hinge") {
    SparseFeatures x(3, 5);
    x.insert(0, 3) = 1.0;
    x.insert(1, 0) = 2.0;
    x.insert(2, 1) = 1.0;
    x.insert(2, 2) = 1.0;

    Eigen::Matrix<std::int8_t, Eigen::Dynamic, 1> y(3);
    y << -1, 1, -1;

    auto loss = make_squared_hinge(std::make_shared<GenericFeatureMatrix>(DenseFeatures(x)),
                                   std::make_unique<objective::SquaredNormRegularizer>());
    loss->get_label_ref() = y;

    auto reference = objective::Regularized_SquaredHingeSVC(std::make_shared<GenericFeatureMatrix>(x),
                                                            std::make_unique<objective::SquaredNormRegularizer>());
    reference.get_label_ref() = y;

    DenseRealVector weights(5);
    weights << 1.0, 2.0, 0.0, -1.0, 2.0;

    auto do_check = [&](real_t factor){
        // z = (-1, 2, 2)
        // 1 - yz = (0, -1, 3)
        CHECK_MESSAGE(loss->value(HashVector{weights}) == doctest::Approx(factor * 9.0 + 5), "wrong value");

        DenseRealVector grad(5);
        loss->gradient(HashVector{weights}, grad);
        // dl/dz = (0, 0, 2*3)
        // dl/dw = 6*(0.0, 1.0, 1.0, 0.0, 0.0) + weights (the regularizer 0.5*||w||^2 contributes w)
        DenseRealVector i(5);
        i << 0.0, 1.0, 1.0, 0.0, 0.0;
        DenseRealVector r = factor * 6 * i + weights;
        CHECK_MESSAGE(grad == r, "wrong gradient");

        // also check numerically, via a first-order Taylor expansion along the gradient direction
        real_t old_val = loss->value(HashVector{weights});
        DenseRealVector nw = weights + grad * 1e-4;
        real_t new_val = loss->value(HashVector{nw});
        CHECK(new_val - old_val == doctest::Approx(grad.squaredNorm() * 1e-4).epsilon(1e-4));

        // preconditioner == diagonal of Hessian
        DenseRealVector prec_new(5);
        DenseRealVector prec_old(5);
        loss->diag_preconditioner(HashVector{weights}, prec_new);
        reference.diag_preconditioner(HashVector{weights}, prec_old);
        CHECK_MESSAGE(prec_new == prec_old, "wrong preconditioner");

        loss->hessian_times_direction(HashVector{weights}, i, prec_new);
        reference.hessian_times_direction(HashVector{weights}, i, prec_old);
        CHECK_MESSAGE(prec_new == prec_old, "wrong hessian");

        // g@0
        loss->gradient_at_zero(prec_new);
        reference.gradient_at_zero(prec_old);
        CHECK_MESSAGE(prec_new == prec_old, "g@0 wrong");
    };

    // since the positive example is correct with margin,
    // re-weighting positives does not change the outcome,
    // whereas negatives change the result by a constant factor
    SUBCASE("unweighted") {
        do_check(1.0);
    }
    SUBCASE("positive-reweighted") {
        loss->update_costs(2.0, 1.0);
        reference.update_costs(2.0, 1.0);
        do_check(1.0);
    }
    SUBCASE("negative-reweighted") {
        loss->update_costs(1.0, 2.0);
        reference.update_costs(1.0, 2.0);
        do_check(2.0);
    }
}

#endif