DiSMEC++
weighting.cpp
Go to the documentation of this file.
1 // Copyright (c) 2021, Aalto University, developed by Erik Schultheis
2 // All rights reserved.
3 //
4 // SPDX-License-Identifier: MIT
5 
6 #include "weighting.h"
7 #include <stdexcept>
8 #include <cmath>
9 #include "spdlog/spdlog.h"
10 #include "data/data.h"
11 
12 using namespace dismec;
13 
14 PropensityModel::PropensityModel(const DatasetBase* data, double a, double b) :
15  m_Data(data), m_A(a), m_B(b)
16 {
17  if(!m_Data) {
18  throw std::invalid_argument("data must not be nullptr");
19  }
20  m_C = (std::log(data->num_examples()) - 1) * std::pow(m_B + 1, m_A);
21 }
22 
24 
25  double d = m_C * std::exp(-m_A * std::log(m_Data->num_positives(label_id) + m_B));
26  return 1.0 / (1.0 + d);
27 }
28 
30  return m_PositiveCost;
31 }
32 
34  return m_NegativeCost;
35 }
36 
37 ConstantWeighting::ConstantWeighting(double positive_cost, double negative_cost) :
38  m_PositiveCost(positive_cost), m_NegativeCost(negative_cost) {
39  if(positive_cost < 0 || negative_cost < 0) {
40  throw std::invalid_argument("Negative cost");
41  }
42 }
43 
45 
46 }
47 
49  return 2.0 / m_Propensity.get_propensity(label_id) - 1.0;
50 }
51 
53  return 1.0;
54 }
55 
57 
58 }
59 
61  return 1.0;
62 }
63 
65  // p / (2-p)
66  double p = m_Propensity.get_propensity(label_id);
67  return p / (2.0 - p);
68 }
69 
71  m_PositiveWeights(std::move(positive_weights)), m_NegativeWeights(std::move(negative_weights)) {
72  // we do not know how many labels there are, but in any case the number should be the same for pos and neg
73  if(m_PositiveWeights.size() != m_NegativeWeights.size()) {
74  throw std::logic_error(fmt::format("Mismatched number of entries: {} in positive and {} in negative weights",
76  ));
77  }
78 }
79 
81  auto index = label_id.to_index();
82  if(index < 0 || index >= m_PositiveWeights.size()) {
83  throw std::logic_error(fmt::format("Trying to get positive weight for label {}, but only {} weights are known.",
84  index, m_PositiveWeights.size()
85  ));
86  }
87  return m_PositiveWeights.coeff(index);
88 }
90  auto index = label_id.to_index();
91  if(index < 0 || index >= m_NegativeWeights.size()) {
92  throw std::logic_error(fmt::format("Trying to get positive weight for label {}, but only {} weights are known.",
93  index, m_NegativeWeights.size()
94  ));
95  }
96  return m_NegativeWeights.coeff(index);
97 }
98 
99 #include "doctest.h"
100 
101 TEST_CASE("propensity calculation") {
102  auto features = SparseFeatures(50, 50);
103  auto labels = std::make_shared<BinaryLabelVector>(BinaryLabelVector::Zero(50));
104  labels->coeffRef(0) = 1;
105  BinaryData fake_data(features, labels);
106 
107  // 1 of 50
108  PropensityModel pm{&fake_data, 0.55, 1.5};
109  auto prop = pm.get_propensity(label_id_t{0});
110  CHECK(prop == doctest::Approx(0.25562221863533147));
111 
112  for(int i = 0; i < 25; ++i) {
113  labels->coeffRef(i) = 1;
114  }
115 
116  // 25 / 50
117  prop = pm.get_propensity(label_id_t{0});
118  CHECK(prop == doctest::Approx(0.5571545100089221));
119 }
120 
121 TEST_CASE("constant weighting") {
122  ConstantWeighting cw(2.0, 5.0);
123  CHECK(cw.get_positive_weight(label_id_t{0}) == 2.0);
124  CHECK(cw.get_negative_weight(label_id_t{0}) == 5.0);
125  CHECK(cw.get_positive_weight(label_id_t{10}) == 2.0);
126  CHECK(cw.get_negative_weight(label_id_t{10}) == 5.0);
127 
128  CHECK_THROWS(ConstantWeighting(-1.0, 2.0));
129  CHECK_THROWS(ConstantWeighting(1.0, -2.0));
130 }
131 
132 TEST_CASE("prop weighting") {
133  auto features = SparseFeatures(50, 50);
134  auto labels = std::make_shared<BinaryLabelVector>(BinaryLabelVector::Zero(50));
135  labels->coeffRef(0) = 1;
136  BinaryData fake_data(features, labels);
137  PropensityWeighting pw{PropensityModel(&fake_data, 0.55, 1.5)};
138  CHECK(pw.get_positive_weight(label_id_t{0}) == doctest::Approx(2.0 / 0.25562221863533147 - 1.0));
139  CHECK(pw.get_negative_weight(label_id_t{0}) == doctest::Approx(1.0));
140 }
141 
Collects the data related to a single optimization problem.
Definition: data.h:69
Simple weighting scheme that assigns the same weighting to all label_ids.
Definition: weighting.h:46
double get_positive_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is present.
Definition: weighting.cpp:29
ConstantWeighting(double positive_cost, double negative_cost)
Definition: weighting.cpp:37
double m_NegativeCost
Cost to use if the label is absent, independent of the label_id.
Definition: weighting.h:53
double m_PositiveCost
Cost to use if the label is present, independent of the label_id.
Definition: weighting.h:52
double get_negative_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is absent.
Definition: weighting.cpp:33
double get_positive_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is present.
Definition: weighting.cpp:80
double get_negative_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is absent.
Definition: weighting.cpp:89
DenseRealVector m_NegativeWeights
Definition: weighting.h:82
DenseRealVector m_PositiveWeights
Definition: weighting.h:81
CustomWeighting(DenseRealVector positive_weights, DenseRealVector negative_weights)
Definition: weighting.cpp:70
long num_examples() const noexcept
Get the total number of instances, i.e. the number of rows in the feature matrix.
Definition: data.cpp:52
virtual long num_positives(label_id_t id) const
Definition: data.cpp:13
double get_positive_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is present.
Definition: weighting.cpp:60
PropensityModel m_Propensity
Definition: weighting.h:71
double get_negative_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is absent.
Definition: weighting.cpp:64
PropensityDownWeighting(PropensityModel model)
Definition: weighting.cpp:56
const DatasetBase * m_Data
Definition: weighting.h:19
PropensityModel(const DatasetBase *data, double a=0.55, double b=1.5)
Definition: weighting.cpp:14
double get_propensity(label_id_t label_id) const
Definition: weighting.cpp:23
PropensityModel m_Propensity
Definition: weighting.h:62
double get_negative_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is absent.
Definition: weighting.cpp:52
PropensityWeighting(PropensityModel model)
Definition: weighting.cpp:44
double get_positive_weight(label_id_t label_id) const override
Gets the weight to use for all examples where the label label_id is present.
Definition: weighting.cpp:48
Strong typedef for an int to signify a label id.
Definition: types.h:20
constexpr T to_index() const
Explicitly convert to an integer.
Definition: opaque_int.h:32
Main namespace in which all types, classes, and functions are defined.
Definition: app.h:15
types::DenseVector< real_t > DenseRealVector
Any dense, real values vector.
Definition: matrix_types.h:40
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
Definition: matrix_types.h:50
TEST_CASE("propensity calculation")
Definition: weighting.cpp:101