DiSMEC++
|
Main namespace in which all types, classes, and functions are defined. More...
Namespaces | |
confusion_matrix_detail | |
eigen_visitors | |
init | |
io | |
l2_reg_sq_hinge_detail | |
model | |
objective | |
parallel | |
postproc | |
prediction | |
solvers | |
stats | |
types | |
Classes | |
class | DataProcessing |
class | DatasetBase |
class | BinaryData |
Collects the data related to a single optimization problem. More... | |
class | MultiLabelData |
class | label_id_t |
Strong typedef for an int to signify a label id. More... | |
class | CascadeTraining |
class | DiSMECTraining |
An implementation of TrainingSpec that models the DiSMEC algorithm. More... | |
class | TrainingSpec |
This class gathers the setting-specific parts of the training process. More... | |
struct | DismecTrainingConfig |
struct | CascadeTrainingConfig |
class | ResultStatsGatherer |
class | TrainingStatsGatherer |
class | TrainingTaskGenerator |
Generates tasks for training weights for the i'th label. More... | |
struct | TrainingResult |
class | PropensityModel |
class | WeightingScheme |
Base class for label-based weighting schemes. More... | |
class | ConstantWeighting |
Simple weighting scheme that assigns the same weighting to all label_ids. More... | |
class | PropensityWeighting |
class | PropensityDownWeighting |
class | CustomWeighting |
class | FastSparseRowIter |
This is an almost verbatim copy of the SparseFeatures::InnerIterator provided by Eigen. More... | |
class | HashVector |
An Eigen vector with versioning information, to implement simple caching of results. More... | |
class | VectorHash |
A unique identifier for a HashVector. More... | |
class | CacheHelper |
class | HyperParameterBase |
Base class for all objects that have adjustable hyper-parameters. More... | |
class | HyperParameters |
This class represents a set of hyper-parameters. More... | |
class | opaque_int_type |
An integer-like type that represents categorical values. More... | |
class | KahanAccumulator |
Implements a numerically stable sum algorithm. More... | |
Enumerations | |
enum class | DatasetTransform { IDENTITY , ONE_PLUS_LOG , LOG_ONE_PLUS , SQRT } |
enum class | RegularizerType { REG_L2 , REG_L1 , REG_L1_RELAXED , REG_HUBER , REG_ELASTIC_50_50 , REG_ELASTIC_90_10 } |
enum class | LossType { SQUARED_HINGE , LOGISTIC , HUBER_HINGE , HINGE } |
Functions | |
void | augment_features_with_bias (DatasetBase &data, real_t bias=1) |
SparseFeatures | augment_features_with_bias (const SparseFeatures &features, real_t bias=1) |
DenseFeatures | augment_features_with_bias (const DenseFeatures &features, real_t bias=1) |
DenseRealVector | get_mean_feature (const GenericFeatureMatrix &features) |
DenseRealVector | get_mean_feature (const SparseFeatures &features) |
DenseRealVector | get_mean_feature (const DenseFeatures &features) |
std::vector< long > | count_features (const SparseFeatures &features) |
void | normalize_instances (DatasetBase &data) |
void | normalize_instances (SparseFeatures &features) |
void | normalize_instances (DenseFeatures &features) |
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > | sort_features_by_frequency (DatasetBase &data) |
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > | sort_features_by_frequency (SparseFeatures &features) |
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > | sort_features_by_frequency (DenseFeatures &features) |
void | hash_sparse_features (SparseFeatures &features, unsigned seed, int buckets, int repeats) |
SparseFeatures | shortlist_features (const SparseFeatures &source, const std::vector< long > &shortlist) |
DenseFeatures | shortlist_features (const DenseFeatures &source, const std::vector< long > &shortlist) |
void | transform_features (DatasetBase &data, DatasetTransform transform) |
void | transform_features (SparseFeatures &features, DatasetTransform transform) |
void | transform_features (DenseFeatures &features, DatasetTransform transform) |
std::ptrdiff_t | operator- (label_id_t a, label_id_t b) |
label_id_t | operator+ (label_id_t a, std::ptrdiff_t b) |
std::shared_ptr< objective::Objective > | make_loss (LossType type, std::shared_ptr< const GenericFeatureMatrix > X, std::unique_ptr< objective::Objective > regularizer) |
std::shared_ptr< TrainingSpec > | create_dismec_training (std::shared_ptr< const DatasetBase > data, HyperParameters params, DismecTrainingConfig config) |
std::shared_ptr< TrainingSpec > | create_cascade_training (std::shared_ptr< const DatasetBase > data, std::shared_ptr< const GenericFeatureMatrix > dense, std::shared_ptr< const std::vector< std::vector< long >>> shortlist, HyperParameters params, CascadeTrainingConfig config) |
TrainingResult | run_training (parallel::ParallelRunner &runner, std::shared_ptr< TrainingSpec > spec, label_id_t begin_label=label_id_t{0}, label_id_t end_label=label_id_t{-1}) |
template<class T > | |
constexpr long | to_long (T value) |
Convert the given value to long, throwing an error if the conversion is not possible. More... | |
template<class T > | |
constexpr std::ptrdiff_t | calc_ssizeof () |
Gets the sizeof of a type as a signed integer. More... | |
template<class C > | |
constexpr auto | ssize (const C &c) -> std::common_type_t< std::ptrdiff_t, std::make_signed_t< decltype(c.size())>> |
signed size free function. Taken from https://en.cppreference.com/w/cpp/iterator/size More... | |
template<typename Scalar , int Options, typename StorageIndex , typename OtherDerived > | |
auto | fast_dot (const Eigen::SparseMatrix< Scalar, Options, StorageIndex > &first, int row, const Eigen::MatrixBase< OtherDerived > &other) -> Scalar |
template<class T > | |
auto | operator+ (const HashVector &vec, T &&other) |
template<class T > | |
auto | operator+ (T &&other, const HashVector &vec) |
template<class T > | |
auto | operator+= (T &&other, const HashVector &vec) |
template<class T > | |
auto | operator* (const HashVector &vec, T &&other) |
template<class T > | |
auto | operator* (T &&other, const HashVector &vec) |
template<class T > | |
auto | operator*= (T &&other, const HashVector &vec) |
template<class Tag , class T > | |
constexpr bool | operator== (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
constexpr bool | operator!= (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
constexpr bool | operator<= (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
constexpr bool | operator< (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
constexpr bool | operator> (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
constexpr bool | operator>= (opaque_int_type< Tag, T > a, opaque_int_type< Tag, T > b) |
template<class Tag , class T > | |
std::ostream & | operator<< (std::ostream &stream, opaque_int_type< Tag, T > a) |
SparseFeatures | make_uniform_sparse_matrix (int rows, int cols, int non_zeros_per_row) |
Creates a sparse matrix with the given number of rows and columns. More... | |
Variables | |
constexpr const long | CG_MIN_ITER_BOUND = 5 |
constexpr const real_t | CG_DEFAULT_EPSILON = 0.5 |
The default value for the tolerance parameter of the conjugate gradient optimization. More... | |
constexpr const int | MIN_TIME_PER_CHUNK_MS = 5 |
If the time needed per chunk of work is less than this, we display a warning. More... | |
constexpr const int | PREDICTION_RUN_CHUNK_SIZE = 1024 |
Default chunk size for predicting scores. More... | |
constexpr const int | PREDICTION_METRICS_CHUNK_SIZE = 4096 |
Default chunk size for calculating metrics. More... | |
template<class T > | |
constexpr std::ptrdiff_t | ssizeof = calc_ssizeof<T>() |
Signed size of type T More... | |
Main namespace in which all types, classes, and functions are defined.
using dismec::BinaryLabelVector = typedef types::DenseVector<std::int8_t> |
Dense vector for storing binary labels.
We use this type to store a dense representation of a binary label where +1 represents presence of the label and -1 represents its absence.
Definition at line 68 of file matrix_types.h.
using dismec::DenseFeatures = typedef types::DenseRowMajor<real_t> |
Dense Feature Matrix in Row Major format.
This is the format in which we store the features of a dense dataset. We use a RowMajor format, because we usually want to iterate over all the features of one example, but not over all the examples of a given feature. Each row corresponds to one instance, and each column to a feature.
Definition at line 58 of file matrix_types.h.
using dismec::DenseRealVector = typedef types::DenseVector<real_t> |
Any dense, real-valued vector.
Definition at line 40 of file matrix_types.h.
using dismec::GenericFeatureMatrix = typedef types::GenericMatrix<DenseFeatures, SparseFeatures> |
Definition at line 60 of file matrix_types.h.
using dismec::GenericInMatrix = typedef types::GenericMatrixRef<const real_t> |
Definition at line 33 of file matrix_types.h.
using dismec::GenericInVector = typedef types::GenericVectorRef<const real_t> |
Definition at line 35 of file matrix_types.h.
using dismec::GenericOutMatrix = typedef types::GenericMatrixRef<real_t> |
Definition at line 32 of file matrix_types.h.
using dismec::GenericOutVector = typedef types::GenericVectorRef<real_t> |
Definition at line 34 of file matrix_types.h.
using dismec::GenericRealVector = typedef types::GenericVector<real_t> |
Definition at line 42 of file matrix_types.h.
using dismec::IndexMatrix = typedef types::DenseRowMajor<long> |
Matrix used for indices in sparse predictions.
This matrix is used for predictions, thus it is in row-major order.
Definition at line 81 of file matrix_types.h.
using dismec::PredictionMatrix = typedef types::DenseRowMajor<real_t> |
Dense matrix in Row Major format used for predictions.
This is the matrix type used for dense predictions. To facilitate predictions on an instance-by-instance basis, this is a RowMajor matrix type. Each row corresponds to one instance, and each column to a label.
Definition at line 75 of file matrix_types.h.
typedef float dismec::real_t |
using dismec::RegularizerSpec = typedef std::variant<objective::SquaredNormConfig, objective::HuberConfig, objective::ElasticConfig> |
using dismec::SparseFeatures = typedef types::SparseRowMajor<real_t> |
Sparse Feature Matrix in Row Major format.
This is the format in which we store the features of a sparse dataset. We use a RowMajor format, because we usually want to iterate over all the features of one example, but not over all the examples of a given feature. Each row corresponds to one instance, and each column to a feature.
Definition at line 50 of file matrix_types.h.
using dismec::SparseRealVector = typedef types::SparseVector<real_t> |
Definition at line 41 of file matrix_types.h.
|
strong |
Enumerator | |
---|---|
IDENTITY | |
ONE_PLUS_LOG | |
LOG_ONE_PLUS | |
SQRT |
Definition at line 37 of file transform.h.
|
strong |
|
strong |
DenseFeatures dismec::augment_features_with_bias | ( | const DenseFeatures & | features, |
real_t | bias = 1 |
||
) |
Definition at line 44 of file transform.cpp.
SparseFeatures dismec::augment_features_with_bias | ( | const SparseFeatures & | features, |
real_t | bias = 1 |
||
) |
Definition at line 29 of file transform.cpp.
void dismec::augment_features_with_bias | ( | DatasetBase & | data, |
real_t | bias = 1 |
||
) |
Definition at line 25 of file transform.cpp.
References dismec::DatasetBase::edit_features(), and dismec::types::visit().
Referenced by dismec::DataProcessing::load(), anonymous_namespace{transform.cpp}::VisitorBias::operator()(), TrainingProgram::run(), and TEST_CASE().
|
constexpr |
Gets the sizeof of a type as a signed integer.
Definition at line 32 of file conversion.h.
std::vector< long > dismec::count_features | ( | const SparseFeatures & | features | ) |
Definition at line 114 of file transform.cpp.
Referenced by main(), and sort_features_by_frequency().
std::shared_ptr< TrainingSpec > dismec::create_cascade_training | ( | std::shared_ptr< const DatasetBase > | data, |
std::shared_ptr< const GenericFeatureMatrix > | dense, | ||
std::shared_ptr< const std::vector< std::vector< long >>> | shortlist, | ||
HyperParameters | params, | ||
CascadeTrainingConfig | config | ||
) |
Definition at line 161 of file cascade.cpp.
References dismec::postproc::create_identity(), dismec::init::create_zero_initializer(), dismec::CascadeTrainingConfig::DenseInit, dismec::CascadeTrainingConfig::DenseReg, dismec::CascadeTrainingConfig::PostProcessing, dismec::CascadeTrainingConfig::SparseInit, dismec::CascadeTrainingConfig::SparseReg, and dismec::CascadeTrainingConfig::StatsGatherer.
Referenced by TrainingProgram::run().
std::shared_ptr< TrainingSpec > dismec::create_dismec_training | ( | std::shared_ptr< const DatasetBase > | data, |
HyperParameters | params, | ||
DismecTrainingConfig | config | ||
) |
Definition at line 157 of file dismec.cpp.
References dismec::postproc::create_identity(), dismec::init::create_zero_initializer(), dismec::DismecTrainingConfig::Init, dismec::DismecTrainingConfig::Loss, dismec::DismecTrainingConfig::PostProcessing, dismec::DismecTrainingConfig::Regularizer, dismec::DismecTrainingConfig::Sparse, dismec::DismecTrainingConfig::StatsGatherer, and dismec::DismecTrainingConfig::Weighting.
Referenced by register_training().
auto dismec::fast_dot | ( | const Eigen::SparseMatrix< Scalar, Options, StorageIndex > & | first, |
int | row, | ||
const Eigen::MatrixBase< OtherDerived > & | other | ||
) | -> Scalar |
Definition at line 55 of file fast_sparse_row_iter.h.
Referenced by dismec::l2_reg_sq_hinge_detail::__attribute__().
DenseRealVector dismec::get_mean_feature | ( | const DenseFeatures & | features | ) |
Definition at line 75 of file transform.cpp.
DenseRealVector dismec::get_mean_feature | ( | const GenericFeatureMatrix & | features | ) |
Definition at line 52 of file transform.cpp.
References dismec::types::visit().
Referenced by TrainingProgram::make_config(), and dismec::init::SubsetFeatureMeanStrategy::SubsetFeatureMeanStrategy().
DenseRealVector dismec::get_mean_feature | ( | const SparseFeatures & | features | ) |
Definition at line 57 of file transform.cpp.
void dismec::hash_sparse_features | ( | SparseFeatures & | features, |
unsigned | seed, | ||
int | buckets, | ||
int | repeats | ||
) |
Definition at line 183 of file transform.cpp.
Referenced by dismec::DataProcessing::load().
std::shared_ptr< objective::Objective > dismec::make_loss | ( | LossType | type, |
std::shared_ptr< const GenericFeatureMatrix > | X, | ||
std::unique_ptr< objective::Objective > | regularizer | ||
) |
Definition at line 41 of file dismec.cpp.
References HINGE, HUBER_HINGE, LOGISTIC, dismec::objective::make_huber_hinge(), dismec::objective::make_logistic_loss(), dismec::objective::make_squared_hinge(), SQUARED_HINGE, and THROW_EXCEPTION.
Referenced by dismec::init::create_ova_primal_initializer(), and dismec::DiSMECTraining::make_objective().
dismec::SparseFeatures dismec::make_uniform_sparse_matrix | ( | int | rows, |
int | cols, | ||
int | non_zeros_per_row | ||
) |
Creates a sparse matrix with the given number of rows and columns.
rows | Number of rows in the matrix. |
cols | Number of columns in the matrix. |
non_zeros_per_row | Number of nonzero entries in each row. The non-zeros will be distributed uniformly among the columns. |
Definition at line 8 of file test_utils.cpp.
Referenced by TEST_CASE().
void dismec::normalize_instances | ( | DatasetBase & | data | ) |
Definition at line 88 of file transform.cpp.
References dismec::DatasetBase::edit_features(), and dismec::types::visit().
Referenced by apply_tfidf(), dismec::DataProcessing::load(), and TrainingProgram::run().
void dismec::normalize_instances | ( | DenseFeatures & | features | ) |
Definition at line 101 of file transform.cpp.
void dismec::normalize_instances | ( | SparseFeatures & | features | ) |
Definition at line 92 of file transform.cpp.
|
inlineconstexpr |
Definition at line 45 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
auto dismec::operator* | ( | const HashVector & | vec, |
T && | other | ||
) |
Definition at line 167 of file hash_vector.h.
References dismec::HashVector::get().
auto dismec::operator* | ( | T && | other, |
const HashVector & | vec | ||
) |
Definition at line 172 of file hash_vector.h.
References dismec::HashVector::get().
auto dismec::operator*= | ( | T && | other, |
const HashVector & | vec | ||
) |
Definition at line 177 of file hash_vector.h.
References dismec::HashVector::get().
auto dismec::operator+ | ( | const HashVector & | vec, |
T && | other | ||
) |
Definition at line 152 of file hash_vector.h.
References dismec::HashVector::get().
|
inline |
Definition at line 34 of file types.h.
References dismec::opaque_int_type< Tag, T >::to_index().
auto dismec::operator+ | ( | T && | other, |
const HashVector & | vec | ||
) |
Definition at line 157 of file hash_vector.h.
References dismec::HashVector::get().
auto dismec::operator+= | ( | T && | other, |
const HashVector & | vec | ||
) |
Definition at line 162 of file hash_vector.h.
References dismec::HashVector::get().
|
inline |
Definition at line 30 of file types.h.
References dismec::opaque_int_type< Tag, T >::to_index().
|
inlineconstexpr |
Definition at line 55 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
std::ostream& dismec::operator<< | ( | std::ostream & | stream, |
opaque_int_type< Tag, T > | a | ||
) |
Definition at line 70 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
|
inlineconstexpr |
Definition at line 50 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
|
inlineconstexpr |
Definition at line 40 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
|
inlineconstexpr |
Definition at line 60 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
|
inlineconstexpr |
Definition at line 65 of file opaque_int.h.
References dismec::opaque_int_type< Tag, T >::to_index().
TrainingResult dismec::run_training | ( | parallel::ParallelRunner & | runner, |
std::shared_ptr< TrainingSpec > | spec, | ||
label_id_t | begin_label = label_id_t{0} , |
||
label_id_t | end_label = label_id_t{-1} |
||
) |
Definition at line 122 of file training.cpp.
References dismec::parallel::ParallelRunner::run().
Referenced by register_training(), and TrainingProgram::run().
DenseFeatures dismec::shortlist_features | ( | const DenseFeatures & | source, |
const std::vector< long > & | shortlist | ||
) |
Definition at line 235 of file transform.cpp.
SparseFeatures dismec::shortlist_features | ( | const SparseFeatures & | source, |
const std::vector< long > & | shortlist | ||
) |
Definition at line 219 of file transform.cpp.
Referenced by dismec::CascadeTraining::update_objective().
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > dismec::sort_features_by_frequency | ( | DatasetBase & | data | ) |
Definition at line 110 of file transform.cpp.
References dismec::DatasetBase::edit_features(), and dismec::types::visit().
Referenced by TEST_CASE().
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > dismec::sort_features_by_frequency | ( | DenseFeatures & | features | ) |
Definition at line 146 of file transform.cpp.
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > dismec::sort_features_by_frequency | ( | SparseFeatures & | features | ) |
Definition at line 127 of file transform.cpp.
References count_features().
|
constexpr |
signed size free function. Taken from https://en.cppreference.com/w/cpp/iterator/size
Definition at line 42 of file conversion.h.
Referenced by dismec::l2_reg_sq_hinge_detail::__attribute__(), dismec::TrainingStatsGatherer::add_accu(), dismec::stats::StatisticsCollection::declare_stat(), dismec::stats::StatisticsCollection::declare_tag(), dismec::prediction::EvaluateMetrics::EvaluateMetrics(), dismec::prediction::EvaluateMetrics::finalize(), dismec::prediction::ConfusionMatrixRecorder::get_confusion_matrix(), dismec::prediction::MacroMetricReporter::get_values(), dismec::l2_reg_sq_hinge_detail::htd_sum(), dismec::prediction::InstanceRankedPositives::InstanceRankedPositives(), main(), anonymous_namespace{regularizers_imp.cpp}::make_vec(), dismec::stats::TaggedStat::merge_imp(), dismec::MultiLabelData::num_labels(), dismec::MultiLabelData::num_negatives(), dismec::MultiLabelData::num_positives(), obesity(), anonymous_namespace{sparse.cpp}::PredictVisitor::operator()(), anonymous_namespace{xmc.cpp}::read_into_buffers(), anonymous_namespace{numpy.cpp}::read_key_value(), dismec::io::read_xmc_dataset(), dismec::stats::TaggedStat::record_real(), dismec::prediction::ConfusionMatrixRecorder::reduce(), anonymous_namespace{numpy.cpp}::skip_whitespace(), anonymous_namespace{collection.cpp}::str_to_id(), TEST_CASE(), dismec::prediction::InstanceRankedPositives::update(), dismec::prediction::AbandonmentAtK::update(), dismec::CascadeTraining::update_objective(), and anonymous_namespace{xmc.cpp}::write_label_list().
|
constexpr |
Convert the given value to long, throwing an error if the conversion is not possible.
Definition at line 14 of file conversion.h.
References THROW_EXCEPTION.
Referenced by dismec::objective::Regularized_SquaredHingeSVC::gradient_and_pre_conditioner_tpl(), anonymous_namespace{numpy.cpp}::read_key_value(), dismec::parallel::ParallelRunner::run(), and dismec::l2_reg_sq_hinge_detail::value_from_xTw().
void dismec::transform_features | ( | DatasetBase & | data, |
DatasetTransform | transform | ||
) |
Definition at line 152 of file transform.cpp.
References dismec::DatasetBase::edit_features(), and dismec::types::visit().
Referenced by dismec::DataProcessing::load(), and TrainingProgram::run().
void dismec::transform_features | ( | DenseFeatures & | features, |
DatasetTransform | transform | ||
) |
Definition at line 179 of file transform.cpp.
References anonymous_namespace{transform.cpp}::transform_features_imp().
void dismec::transform_features | ( | SparseFeatures & | features, |
DatasetTransform | transform | ||
) |
Definition at line 175 of file transform.cpp.
References anonymous_namespace{transform.cpp}::transform_features_imp().
|
constexpr |
|
constexpr |
The minimum upper bound for the number of CG iterations. If the problem has fewer than this many dimensions, the upper bound for the number of CG iterations is still given by this number. TODO: is it even sensible to define something like this?
Definition at line 22 of file config.h.
Referenced by dismec::solvers::CGMinimizer::do_minimize().
|
constexpr |
If the time needed per chunk of work is less than this, we display a warning.
Definition at line 28 of file config.h.
Referenced by dismec::parallel::ParallelRunner::run().
|
constexpr |
|
constexpr |
|
constexpr |
Signed size of type T
Definition at line 38 of file conversion.h.