#include "nlohmann/json.hpp"

double obesity(const std::vector<long>& values, int num_samples);
int main(int argc, const char** argv) {
    std::string DataSetFile;
    std::string OutputFile;
    bool OneBasedIndex = false;

    CLI::App app{"labelstats"};
    app.add_option("dataset", DataSetFile,
                   "The file from which the data will be loaded.")->required()->check(CLI::ExistingFile);
    app.add_option("target", OutputFile,
                   "The file to which the result will be saved.")->required();

    app.add_flag("--one-based-index", OneBasedIndex,
                 "If this flag is given, then we assume that the input dataset is in xmc format and"
                 " has one-based indexing, i.e. the first label and feature are at index 1 (as opposed to the usual 0)");
    try {
        app.parse(argc, argv);
    } catch (const CLI::ParseError &e) {
        // The handler body is elided in this excerpt; the usual CLI11 idiom is:
        return app.exit(e);
    }
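    // The dataset-loading step is elided from this excerpt. Judging from the referenced
    // symbols listed at the end, it presumably amounts to something like the sketch
    // below; the exact call in the original file may differ:
    auto data = read_xmc_dataset(DataSetFile,
                                 OneBasedIndex ? IndexMode::ONE_BASED : IndexMode::ZERO_BASED);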
 
    // Count the number of positive instances for every label.
    std::vector<long> label_counts;
    for(long id = 0; id < data.num_labels(); ++id) {
        label_counts.push_back(static_cast<long>(data.num_positives(label_id_t{id})));
    }

    // Sort ascending, so front() is the rarest label and back() the most frequent one.
    std::sort(begin(label_counts), end(label_counts));
 
    // Collect the statistics in a json object (its declaration is elided in the excerpt).
    nlohmann::json result;
    result["num-labels"] = data.num_labels();
    result["num-instances"] = data.num_examples();
    result["most-frequent"] = label_counts.back();
    result["least-frequent"] = label_counts.front();
    // Imbalance ratios: instances per positive of the most/least frequent label,
    // and the ratio between the most and least frequent label counts.
    result["intra-IR-min"] = double(data.num_examples()) / double(std::max(1l, label_counts.back()));
    result["intra-IR-max"] = double(data.num_examples()) / double(std::max(1l, label_counts.front()));
    result["inter-IR"] = double(label_counts.back()) / double(std::max(1l, label_counts.front()));
 
    // cumulative[i] = total number of positive instances covered by the (i + 1) most
    // frequent labels (partial sums over the counts in descending order).
    std::vector<long> cumulative;
    std::partial_sum(label_counts.rbegin(), label_counts.rend(), std::back_inserter(cumulative));

    // Prints the first two partial sums and the total number of positive instances.
    std::cout << cumulative[0] << " " << cumulative[1] << " " << cumulative[cumulative.size() - 1] << "\n";

    // `target` is a coverage percentage; its definition and parts of the surrounding
    // control flow (e.g. leaving the loop once the threshold is reached) are elided in
    // this excerpt. The condition checks whether the i + 1 most frequent labels already
    // cover at least `target` percent of all positive instances (up to integer rounding);
    // the recorded value is the corresponding fraction of labels, in percent.
    for(int i = 0; i < ssize(cumulative); ++i) {
        if(cumulative[i] / target >= cumulative.back() / 100) {
            result["cumulative-rel-" + std::to_string(target)] = 100.0 * double(i) / double(data.num_labels());
        }
    }

    // Estimated by random sampling; see obesity() below.
    result["obesity"] = obesity(label_counts, 10000);
 
    // Write the statistics as pretty-printed json (setw sets the indentation width).
    std::fstream result_file(OutputFile, std::fstream::out);
    result_file << std::setw(4) << result << "\n";
}
 
double obesity(const std::vector<long>& values, int num_samples) {
    // The random engine `rng` and the counter `larger` are declared in lines elided
    // from this excerpt. Each iteration draws four label counts at random and checks
    // whether the smallest and largest together exceed the two middle ones, which
    // serves as a sampling-based indicator of how heavy-tailed the distribution is.
    std::uniform_int_distribution<long> dist(0, ssize(values) - 1);
    std::array<long, 4> sample{};

    for(int i = 0; i < num_samples; ++i) {
        for(auto& s : sample)  s = dist(rng);
        // Sorting the indices also orders the counts, because `values` is sorted ascending.
        std::sort(begin(sample), end(sample));
        if(values[sample[0]] + values[sample[3]] > values[sample[1]] + values[sample[2]]) {
            ++larger;   // (the increment itself falls in elided lines; reconstructed here)
        }
    }

    // Report the share of such samples as a percentage.
    return double(larger) / double(num_samples / 100);
}
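The random engine and the success counter used by obesity() fall outside this excerpt. For reference, here is a minimal self-contained sketch of the same estimator; the std::mt19937 engine, the fixed seed, the helper main and the toy counts are assumptions for illustration only, not the original implementation. The input must already be sorted in ascending order, as label_counts is at the point of the call.

#include <algorithm>
#include <array>
#include <iostream>
#include <random>
#include <vector>

// Minimal sketch of the estimator above; assumes `values` is sorted ascending.
double obesity_sketch(const std::vector<long>& values, int num_samples) {
    std::mt19937 rng(42);   // assumed engine and seed, not from the original file
    std::uniform_int_distribution<long> dist(0, static_cast<long>(values.size()) - 1);
    std::array<long, 4> sample{};
    long larger = 0;
    for(int i = 0; i < num_samples; ++i) {
        for(auto& s : sample) s = dist(rng);         // draw four random indices
        std::sort(begin(sample), end(sample));       // orders the counts, since values is sorted
        if(values[sample[0]] + values[sample[3]] > values[sample[1]] + values[sample[2]]) {
            ++larger;
        }
    }
    return double(larger) / double(num_samples / 100);   // share of samples, in percent
}

int main() {
    std::vector<long> counts{1, 1, 2, 3, 5, 8, 13, 21, 500, 10000};   // toy counts, sorted ascending
    std::cout << obesity_sketch(counts, 10000) << "\n";
}

Roughly speaking, the heavier the tail of the count distribution, the closer the returned percentage gets to 100.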
 
Referenced symbols:

    label_id_t
        Strong typedef for an int to signify a label id.
    double obesity(const std::vector<long>& values, int num_samples)
    int main(int argc, const char** argv)
    const char* to_string(WeightFormat format)
    MultiLabelData read_xmc_dataset(const std::filesystem::path& source, IndexMode mode = IndexMode::ZERO_BASED)
        Reads a dataset given in the extreme multilabel classification (xmc) format.
    IndexMode::ONE_BASED
        Labels and feature indices are 1, 2, ..., num.
    IndexMode::ZERO_BASED
        Labels and feature indices are 0, 1, ..., num - 1.
    constexpr auto ssize(const C& c) -> std::common_type_t<std::ptrdiff_t, std::make_signed_t<decltype(c.size())>>
        Signed size free function; taken from https://en.cppreference.com/w/cpp/iterator/size.
    (enclosing namespace)
        Main namespace in which all types, classes, and functions are defined.