31 new_sparse.reserve(features.nonZeros() + features.rows());
32 for (
int k=0; k < features.outerSize(); ++k) {
33 new_sparse.startVec(k);
34 for (SparseFeatures::InnerIterator it(features, k); it; ++it)
36 new_sparse.insertBack(it.row(), it.col()) = it.value();
38 new_sparse.insertBack(k, features.cols()) = bias;
40 new_sparse.finalize();
45 DenseFeatures new_features{features.rows(), features.cols() + 1};
46 new_features.leftCols(features.cols()) = features;
47 new_features.col(features.cols()).setConstant(bias);
61 auto start = features.outerIndexPtr()[0];
62 auto end = features.outerIndexPtr()[features.rows()];
64 const auto* indices = features.innerIndexPtr();
65 const auto* values = features.valuePtr();
66 for(
auto index = start; index < end; ++index) {
67 auto col = indices[index];
68 result[col] += values[index];
71 result /= features.rows();
79 for(
int i = 0; i < features.rows(); ++i) {
80 result += features.row(i);
83 result /= features.rows();
93 for(
int i = 0; i < features.rows(); ++i) {
94 real_t norm = features.row(i).norm();
96 features.row(i) /= norm;
102 for(
int i = 0; i < features.rows(); ++i) {
103 real_t norm = features.row(i).norm();
105 features.row(i) /= norm;
115 std::vector<long> counts(features.cols(), 0);
116 assert(features.isCompressed());
120 const auto* last = features.innerIndexPtr() + features.nonZeros();
121 for(
const auto* start = features.innerIndexPtr(); start != last; ++start) {
128 if(!features.isCompressed()) {
129 features.makeCompressed();
135 std::sort(reorder.begin(), reorder.end(), [&](
int a,
int b){
136 return counts[a] < counts[b];
140 Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> permutation(reorder);
142 features = features * permutation;
147 Eigen::PermutationMatrix<Eigen::Dynamic, Eigen::Dynamic, int> permutation(features.cols());
148 permutation.setIdentity();
163 features = features.unaryExpr([](
const real_t& value) {
return std::log1p(value); });
166 features = features.unaryExpr([](
const real_t& value) {
return real_t{1} + std::log(value); });
169 features = features.unaryExpr([](
const real_t& value) {
return std::sqrt(value); });
185 if(!features.isCompressed()) {
186 features.makeCompressed();
189 std::ranlux24 rng(seed);
190 std::uniform_int_distribution<int> mapping(0, buckets - 1);
191 Eigen::MatrixXi hash = Eigen::MatrixXi::NullaryExpr(features.cols(), repeats, [&](){
196 for (
int k=0; k < features.rows(); ++k) {
199 for (SparseFeatures::InnerIterator it(features, k); it; ++it)
201 for(
int j = 0; j < repeats; ++j) {
202 new_row.coeffRef(hash.coeff(it.col(), j) + j * buckets) += it.value();
207 for(
int i = 0; i < new_row.size(); ++i) {
208 if(new_row.coeff(i) > 0) {
209 result.insertBack(k, i) = new_row.coeff(i);
216 features = std::move(result);
221 new_features.reserve(2 * source.nonZeros() *
double(shortlist.size()) /
double(source.rows()));
223 for (
auto row : shortlist) {
224 new_features.startVec(new_row);
225 for (SparseFeatures::InnerIterator it(source, row); it; ++it)
227 new_features.insertBack(new_row, it.col()) = it.value();
231 new_features.finalize();
238 for (
auto row : shortlist) {
239 new_features.row(new_row) = source.row(row);
250 test.insert(3, 2) = 2.0;
251 test.insert(1, 3) = -1.0;
252 test.insert(0, 4) = 5.0;
253 test.insert(2, 2) = 2.0;
254 test.insert(2, 3) = 4.0;
262 CHECK(dense_test.leftCols(Eigen::fix<5>) == dense_ext.leftCols(Eigen::fix<5>));
263 CHECK(dense_ext.col(Eigen::fix<5>) == DenseFeatures::Ones(5, 1));
268 test.insert(3, 2) = 2.0;
269 test.insert(1, 3) = -1.0;
270 test.insert(2, 2) = 2.0;
271 test.insert(1, 2) = 2.0;
272 test.insert(2, 3) = 4.0;
273 test.insert(2, 0) = -4.0;
279 expected.insert(3, 3) = 2.0;
280 expected.insert(1, 2) = -1.0;
281 expected.insert(2, 3) = 2.0;
282 expected.insert(1, 3) = 2.0;
283 expected.insert(2, 2) = 4.0;
284 expected.insert(2, 1) = -4.0;
288 CHECK(test.toDense() == expected.toDense());
std::shared_ptr< GenericFeatureMatrix > edit_features()
Get a shared pointer to mutable feature data. Use with care.
outer_const< T, dense_vector_h > DenseVector
auto visit(F &&f, Variants &&... variants)
Main namespace in which all types, classes, and functions are defined.
types::DenseRowMajor< real_t > DenseFeatures
Dense Feature Matrix in Row Major format.
void normalize_instances(DatasetBase &data)
void augment_features_with_bias(DatasetBase &data, real_t bias=1)
void transform_features(DatasetBase &data, DatasetTransform transform)
types::DenseVector< real_t > DenseRealVector
Any dense, real-valued vector.
std::vector< long > count_features(const SparseFeatures &features)
types::SparseRowMajor< real_t > SparseFeatures
Sparse Feature Matrix in Row Major format.
void hash_sparse_features(SparseFeatures &features, unsigned seed, int buckets, int repeats)
Eigen::PermutationMatrix< Eigen::Dynamic, Eigen::Dynamic, int > sort_features_by_frequency(DatasetBase &data)
SparseFeatures shortlist_features(const SparseFeatures &source, const std::vector< long > &shortlist)
DenseRealVector get_mean_feature(const GenericFeatureMatrix &features)
float real_t
The default type for floating point values.