7 #include "nlohmann/json.hpp"
// Forward declaration so that main() can call obesity() before its definition,
// which appears further down in this file. It takes a vector of counts and the
// number of random samples to draw, and returns a double.
16 double obesity(
const std::vector<long>& values,
int num_samples);
// Entry point of the `labelstats` command-line tool. It parses the command line,
// collects the number of positives per label from `data`, stores several summary
// statistics in `result`, and writes `result` to the output file.
// NOTE(review): this listing has gaps (the embedded original line numbers skip,
// e.g. 34 -> 39, 44 -> 47, 63 -> 68). The `try {`, the loading of `data`, the
// declarations of `result` and `target`, and the closing braces are not shown here.
18 int main(
int argc,
const char** argv) {
// Destinations filled in by the command-line parser.
19 std::string DataSetFile;
20 std::string OutputFile;
// One-based indexing is off unless the flag below is passed.
21 bool OneBasedIndex =
false;
22 CLI::App app{
"labelstats"};
// Required `dataset` argument; CLI::ExistingFile validates the given path.
23 app.add_option(
"dataset", DataSetFile,
24 "The file from which the data will be loaded.")->required()->check(CLI::ExistingFile);
// Required `target` argument: where the statistics are written.
25 app.add_option(
"target", OutputFile,
26 "The file to which the result will be saved.")->required();
// Optional flag stored in OneBasedIndex. How it is forwarded to the dataset
// reader is on lines not shown in this listing.
28 app.add_flag(
"--one-based-index", OneBasedIndex,
29 "If this flag is given, then we assume that the input dataset in xmc format and"
30 " has one-based indexing, i.e. the first label and feature are at index 1 (as opposed to the usual 0)");
// Parse the command line; a CLI::ParseError is caught by the clause below
// (the handler body is not shown in this listing).
33 app.parse(argc, argv);
34 }
catch (
const CLI::ParseError &e) {
// Collect the number of positive examples of every label, converted to long.
39 std::vector<long> label_counts;
40 for(
long id = 0;
id < data.num_labels(); ++id) {
41 label_counts.push_back(
static_cast<long>(data.num_positives(
label_id_t{id})));
// Ascending sort: afterwards front() is the smallest count and back() the largest.
// NOTE(review): front()/back() below are undefined behaviour on an empty vector,
// i.e. if data.num_labels() is 0 -- confirm that cannot happen.
44 std::sort(begin(label_counts), end(label_counts));
// Basic dataset sizes and the extreme label frequencies.
47 result[
"num-labels"] = data.num_labels();
48 result[
"num-instances"] = data.num_examples();
49 result[
"most-frequent"] = label_counts.back();
50 result[
"least-frequent"] = label_counts.front();
// Ratios of the example count to the largest / smallest label count, and of the
// largest to the smallest label count. std::max(1l, ...) keeps every denominator
// at least 1, so a label with zero positives does not cause a division by zero.
51 result[
"intra-IR-min"] = double(data.num_examples()) / double(std::max(1l, label_counts.back()));
52 result[
"intra-IR-max"] = double(data.num_examples()) / double(std::max(1l, label_counts.front()));
53 result[
"inter-IR"] = double(label_counts.back()) / double(std::max(1l, label_counts.front()));
// Running sums of the counts, iterating in reverse over the ascending vector,
// i.e. accumulated from the most frequent label to the least frequent one.
56 std::vector<long> cumulative;
57 std::partial_sum(label_counts.rbegin(), label_counts.rend(), std::back_inserter(cumulative));
// Prints the first, second and last running sum to stdout.
// NOTE(review): cumulative[1] is read unconditionally, which is out of bounds
// when there are fewer than two labels.
59 std::cout << cumulative[0] <<
" " << cumulative[1] <<
" " << cumulative[cumulative.size() - 1] <<
"\n";
// For an index i whose running sum satisfies the threshold test, record i as a
// percentage of the number of labels under the key "cumulative-rel-<target>".
// NOTE(review): `target` is declared on a line not shown. If it is an integer,
// both divisions in the condition truncate -- confirm this is intended.
60 for(
int i = 0; i <
ssize(cumulative); ++i) {
61 if(cumulative[i] / target >= cumulative.back() / 100) {
63 result[
"cumulative-rel-" +
std::to_string(target)] = 100.0 * double(i) / double(data.num_labels());
// Sampling-based statistic over the sorted counts, using 10000 samples.
68 result[
"obesity"] =
obesity(label_counts, 10000);
// Write `result` to the output file, followed by a newline.
// `result` is presumably an nlohmann::json object (its declaration is not shown),
// for which std::setw(4) selects pretty-printing with an indent of 4 -- TODO confirm.
70 std::fstream result_file(OutputFile, std::fstream::out);
71 result_file << std::setw(4) << result <<
"\n";
// Sampling-based statistic over `values`. In each of `num_samples` rounds, four
// random indices into `values` are drawn and sorted, and the sum of the values at
// the smallest and largest index is compared with the sum of the values at the
// two middle indices. The only visible caller, main(), passes a vector sorted in
// ascending order, so sorted indices there correspond to sorted values.
//
// values:      the counts to sample from.
// num_samples: the number of rounds of four draws.
// returns:     double(larger) / double(num_samples / 100).
//
// NOTE(review): this listing has gaps. The declarations of `rng` and `larger`, the
// body of the `if`, and the closing braces are not shown. `larger` presumably
// counts the rounds in which the comparison holds, which would make the return
// value a percentage -- TODO confirm.
77 double obesity(
const std::vector<long>& values,
int num_samples) {
// Uniform distribution over all valid indices of `values`.
// NOTE(review): for an empty `values` the upper bound is -1, below the lower
// bound 0, which violates the precondition of std::uniform_int_distribution.
79 std::uniform_int_distribution<long> dist(0,
ssize(values) - 1);
// Holds the four indices drawn in one round; value-initialised to zeros.
80 std::array<long, 4> sample{};
82 for(
int i = 0; i < num_samples; ++i) {
// Draw four indices independently, so duplicates are possible.
83 for(
auto& s : sample) s = dist(rng);
// Order the indices so sample[0] is the smallest and sample[3] the largest.
84 std::sort(begin(sample), end(sample));
// Outer pair versus inner pair of the four selected values.
85 if(values[sample[0]] + values[sample[3]] > values[sample[1]] + values[sample[2]]) {
// NOTE(review): `num_samples / 100` is an integer division. The denominator is
// 0 for num_samples < 100 and is truncated when num_samples is not a multiple
// of 100; dividing by (num_samples / 100.0) would avoid both.
90 return double(larger) / double(num_samples / 100);
Strong typedef for an int to signify a label id.
double obesity(const std::vector< long > &values, int num_samples)
int main(int argc, const char **argv)
const char * to_string(WeightFormat format)
MultiLabelData read_xmc_dataset(const std::filesystem::path &source, IndexMode mode=IndexMode::ZERO_BASED)
Reads a dataset given in the extreme multilabel classification format.
@ ONE_BASED
labels and feature indices are 1, 2, ..., num
@ ZERO_BASED
labels and feature indices are 0, 1, ..., num - 1
Main namespace in which all types, classes, and functions are defined.
constexpr auto ssize(const C &c) -> std::common_type_t< std::ptrdiff_t, std::make_signed_t< decltype(c.size())>>
Signed size free function. Taken from https://en.cppreference.com/w/cpp/iterator/size