DiSMEC++
msi.cpp
Go to the documentation of this file.
1 // Copyright (c) 2021, Aalto University, developed by Erik Schultheis
2 // All rights reserved.
3 //
4 // SPDX-License-Identifier: MIT
5 
6 #include "subset.h"
7 #include "stats/collection.h"
8 #include "stats/timer.h"
9 #include "data/types.h"
10 #include "data/data.h"
11 #include "objective/objective.h"
12 #include "utils/hash_vector.h"
13 
14 
15 using namespace dismec::init;
16 
// Declarations of the two types whose member functions are defined further down in this file:
// MeanOfFeaturesInitializer (produces the initial weight vector for a label) and
// MeanOfFeaturesStrategy (creates such initializers).
17 namespace dismec::init {
// NOTE(review): listing line 18 is absent from this extract; presumably it holds the class head of
// MeanOfFeaturesInitializer. The constructor definition below initialises a SubsetFeatureMeanInitializer
// base -- confirm against the real file.
19  public:
// Constructor. Takes the dataset, the mean vector over all instances, the feature matrix to work on,
// and two real_t values `pos` / `neg`; the definition passes all of them on to SubsetFeatureMeanInitializer.
20  MeanOfFeaturesInitializer(std::shared_ptr<const DatasetBase> data,
21  const DenseRealVector& mean_of_all,
22  std::shared_ptr<const GenericFeatureMatrix> local_features,
23  real_t pos, real_t neg);
24 
// Writes the initial weight vector for `label_id` into `target`. `objective` is used by the definition
// to evaluate the objective value at the produced vector and at the zero vector.
25  void get_initial_weight(label_id_t label_id, Eigen::Ref<DenseRealVector> target, objective::Objective& objective) override;
26  private:
// NOTE(review): listing lines 27, 28 and 30 are absent from this extract. The cross-reference index of
// the listing places STAT_POSITIVE_FACTOR (27), STAT_ALL_MEAN_FACTOR (28) and STAT_LOSS_REDUCTION (30)
// there; their numeric ids are not visible here.
// Statistics id under which the constructor declares "num_pos" and get_initial_weight records the
// number of positives of the label.
29  static constexpr stats::stat_id_t STAT_NUM_POS{3};
31  };
32 
// NOTE(review): listing line 33 is absent; presumably the class head of MeanOfFeaturesStrategy -- confirm.
34  public:
// NOTE(review): listing line 35 is absent from this extract; its content cannot be determined from here.
36 
// Creates a new MeanOfFeaturesInitializer for the given feature matrix (see the definition below).
// The result must not be discarded.
37  [[nodiscard]] std::unique_ptr<WeightsInitializer>
38  make_initializer(const std::shared_ptr<const GenericFeatureMatrix>& features) const override;
39  };
40 
41 }
42 
43 MeanOfFeaturesInitializer::MeanOfFeaturesInitializer(std::shared_ptr<const DatasetBase> data,
44  const DenseRealVector& mean_of_all,
45  std::shared_ptr<const GenericFeatureMatrix> local_features,
46  real_t pos, real_t neg) :
47  SubsetFeatureMeanInitializer(std::move(data), mean_of_all, std::move(local_features), pos, neg)
48 {
49  declare_stat(STAT_POSITIVE_FACTOR, {"positive", {}});
50  declare_stat(STAT_ALL_MEAN_FACTOR, {"all_mean", {}});
51  declare_stat(STAT_NUM_POS, {"num_pos", "#positives"});
52  declare_stat(STAT_LOSS_REDUCTION, {"loss_reduction", "(f(0)-f(w))/f(0) [%]"});
53 }
54 
// Computes the initial weight vector for `label_id` and stores it in `target`.
//
// Steps visible in this listing:
//   1. Accumulate, into `target`, every feature row whose label entry is positive, each divided by
//      the number of positives of the label.
//   2. Obtain two factors from calculate_factors() and overwrite `target` with
//      `target * p + m_MeanOfAll * a`.
//   3. Record statistics, including the objective value at the result compared to the zero vector.
//
// NOTE(review): listing lines 74, 75 and 77 are absent from this extract, so the statements between
// step 2 and the objective evaluation are only partially visible.
55 void MeanOfFeaturesInitializer::get_initial_weight(label_id_t label_id, Eigen::Ref<DenseRealVector> target, objective::Objective& objective)
56 {
// Scope timer reporting to STAT_DURATION.
57  auto timer = make_timer(STAT_DURATION);
// Requests the labels for `label_id`; the loop below reads them from m_LabelBuffer.
58  m_DataSet->get_labels(label_id, m_LabelBuffer);
59 
// Start from zero because the loop below accumulates into `target`.
60  target.setZero();
61  int num_pos = m_DataSet->num_positives(label_id);
// The lambda is invoked with the concrete matrix held by *m_LocalFeatures.
62  visit([&](const auto& matrix) {
63  // I've put the entire loop into the visit so that the sparse/dense dispatch happens only once
64  for(int i = 0; i < m_LabelBuffer.size(); ++i) {
// Only rows with a positive label entry contribute.
65  if(m_LabelBuffer.coeff(i) > 0.0) {
// Dividing each row by num_pos yields the mean of the positive rows once the loop finishes,
// assuming num_pos equals the number of positive entries in m_LabelBuffer -- TODO confirm.
66  target += matrix.row(i) / (real_t)num_pos;
67  }
68  }
69  }, *m_LocalFeatures);
70 
// p scales the accumulated vector, a scales m_MeanOfAll.
71  auto [p, a] = calculate_factors(label_id, target);
72  target = target * p + m_MeanOfAll * a;
73 
// NOTE(review): listing lines 74 and 75 are absent here; presumably they record p and a under
// STAT_POSITIVE_FACTOR / STAT_ALL_MEAN_FACTOR -- confirm against the real file.
76  record(STAT_NUM_POS, num_pos);
// NOTE(review): listing line 77 is absent here. The `return` and the closing `});` below belong to a
// callable opened on that line; presumably it is passed to record(STAT_LOSS_REDUCTION, ...) -- confirm.
// Evaluate the objective at the new weights ...
78  HashVector temp{target};
79  real_t obj_at_new = objective.value(temp);
// ... and at the all-zero vector.
80  temp.modify().setZero();
81  real_t obj_at_zero = objective.value(temp);
// Relative reduction of the objective versus the zero vector, in percent.
82  return 100.f * (obj_at_zero - obj_at_new) / obj_at_zero;
83  });
84 }
85 
86 std::unique_ptr<WeightsInitializer> MeanOfFeaturesStrategy::make_initializer(const std::shared_ptr<const GenericFeatureMatrix>& features) const {
87  return std::make_unique<MeanOfFeaturesInitializer>(m_DataSet, m_MeanOfAllInstances, features, m_PositiveTarget, m_NegativeTarget);
88 }
89 
90 std::shared_ptr<WeightInitializationStrategy> dismec::init::create_feature_mean_initializer(std::shared_ptr<DatasetBase> data, real_t pos, real_t neg) {
91  return std::make_shared<MeanOfFeaturesStrategy>(std::move(data), pos, neg);
92 }
An Eigen vector with versioning information, to implement simple caching of results.
Definition: hash_vector.h:43
void get_initial_weight(label_id_t label_id, Eigen::Ref< DenseRealVector > target, objective::Objective &objective) override
Generate an initial vector for the given label. The result should be placed in target.
Definition: msi.cpp:55
static constexpr stats::stat_id_t STAT_LOSS_REDUCTION
Definition: msi.cpp:30
static constexpr stats::stat_id_t STAT_ALL_MEAN_FACTOR
Definition: msi.cpp:28
static constexpr stats::stat_id_t STAT_NUM_POS
Definition: msi.cpp:29
MeanOfFeaturesInitializer(std::shared_ptr< const DatasetBase > data, const DenseRealVector &mean_of_all, std::shared_ptr< const GenericFeatureMatrix > local_features, real_t pos, real_t neg)
Definition: msi.cpp:43
static constexpr stats::stat_id_t STAT_POSITIVE_FACTOR
Definition: msi.cpp:27
std::unique_ptr< WeightsInitializer > make_initializer(const std::shared_ptr< const GenericFeatureMatrix > &features) const override
Creates a new, thread-local WeightsInitializer.
Definition: msi.cpp:86
std::shared_ptr< const DatasetBase > m_DataSet
Definition: subset.h:19
std::shared_ptr< const GenericFeatureMatrix > m_LocalFeatures
Definition: subset.h:20
std::pair< real_t, real_t > calculate_factors(label_id_t label_id, const Eigen::Ref< DenseRealVector > &mean_of_positives)
Definition: subset.cpp:43
static constexpr stats::stat_id_t STAT_DURATION
Definition: subset.h:29
SubsetFeatureMeanStrategy(std::shared_ptr< const DatasetBase > data, real_t negative_target, real_t positive_target)
Definition: subset.cpp:76
std::shared_ptr< const DatasetBase > m_DataSet
Definition: subset.h:42
DenseRealVector m_MeanOfAllInstances
Definition: subset.h:43
Strong typedef for an int to signify a label id.
Definition: types.h:20
Class that models an optimization objective.
Definition: objective.h:41
void record(stat_id_t stat, T &&value)
Record statistics. This function just forwards all its arguments to the internal StatisticsCollection...
Definition: tracked.h:90
auto make_timer(stat_id_t id, Args... args)
Creates a new ScopeTimer using stats::record_scope_time.
Definition: tracked.h:130
void declare_stat(stat_id_t index, StatisticMetaData meta)
Declares a new statistic. This function just forwards all its arguments to the internal StatisticsCollection...
Definition: tracked.cpp:16
std::shared_ptr< WeightInitializationStrategy > create_feature_mean_initializer(std::shared_ptr< DatasetBase > data, real_t pos=1, real_t neg=-2)
Creates an initialization strategy based on the mean of positive and negative features.
Definition: msi.cpp:90
auto visit(F &&f, Variants &&... variants)
Definition: eigen_generic.h:95
types::DenseVector< real_t > DenseRealVector
Any dense, real-valued vector.
Definition: matrix_types.h:40
float real_t
The default type for floating point values.
Definition: config.h:17