Designing a Simple Classifier Based on the Categorical and Normal Distributions

mac  2025-12-03

I have recently been reading Computer Vision: Models, Learning, and Inference; chapter 6 introduces an algorithm for building a simple generative classifier. The basic design is as follows: write a data generator that produces data for three labels, each label drawn from its own normal distribution, with label1 : label2 : label3 = 7 : 3 : 5; then sample one tenth of the generated data, in the same proportions, as the test set, which guarantees that the ratio between the three classes stays unchanged; finally, write the data to files at a specified location.

The code is as follows:

// Common includes and helpers shared by all the code in this post.
#include <cassert>
#include <cmath>
#include <fstream>
#include <iostream>
#include <random>
#include <string>
#include <utility>
#include <vector>

using std::cout;
using std::endl;
using std::pair;
using std::string;
using std::vector;

const double pi = 3.14159265358979323846;

void generate_data_for_learning() {
    // Build a data set for the classification algorithm.
    // Three labels: label1 -> 700, label2 -> 300, label3 -> 500 samples (1500 total).
    // label1 ~ Normal(mu = 0, var = 3)
    // label2 ~ Normal(mu = 3, var = 1)
    // label3 ~ Normal(mu = 9, var = 10)
    // Train set size : test set size == 9 : 1.
    // Set the hyperparameters.
    double mu1 = 0, var1 = 3;
    double mu2 = 3, var2 = 1;
    double mu3 = 9, var3 = 10;
    int label_1_size = 700;
    int label_2_size = 300;
    int label_3_size = 500;

    std::random_device rd{};
    std::mt19937 gen{ rd() };
    // std::normal_distribution takes the standard deviation, not the
    // variance, as its second parameter, so pass sqrt(var).
    std::normal_distribution<> d1{ mu1, std::sqrt(var1) };
    std::normal_distribution<> d2{ mu2, std::sqrt(var2) };
    std::normal_distribution<> d3{ mu3, std::sqrt(var3) };

    vector<std::pair<int, double>> train_data, test_data;
    // Generate the data: the first tenth of each label goes to the test set.
    for (int i = 0; i < label_1_size; i++) {
        if (i < 70) test_data.push_back(std::make_pair(1, d1(gen)));
        else        train_data.push_back(std::make_pair(1, d1(gen)));
    }
    for (int i = 0; i < label_2_size; i++) {
        if (i < 30) test_data.push_back(std::make_pair(2, d2(gen)));
        else        train_data.push_back(std::make_pair(2, d2(gen)));
    }
    for (int i = 0; i < label_3_size; i++) {
        if (i < 50) test_data.push_back(std::make_pair(3, d3(gen)));
        else        train_data.push_back(std::make_pair(3, d3(gen)));
    }

    std::string train_file_name = "E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt";
    std::string test_file_name = "E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt";
    std::fstream train_file(train_file_name, std::ios_base::out);
    std::fstream test_file(test_file_name, std::ios_base::out);
    // Each line has the form "<value> <label>".
    for (int i = 0; i < 150; i++)
        test_file << test_data[i].second << " " << test_data[i].first << std::endl;
    for (int i = 0; i < 1350; i++)
        train_file << train_data[i].second << " " << train_data[i].first << std::endl;
    train_file.close();
    test_file.close();
}
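One detail worth flagging: std::normal_distribution takes the standard deviation, not the variance, as its second constructor argument, which is why the snippet above passes std::sqrt(var). A small stand-alone check (the seed and names here are my own choices, not from the post) that the drawn samples match the intended parameters:

#include <cmath>
#include <iostream>
#include <random>

int main() {
    std::mt19937 gen{ 42 };                               // fixed seed for repeatability
    std::normal_distribution<> d{ 0.0, std::sqrt(3.0) };  // mu = 0, var = 3
    double sum = 0, sum_sq = 0;
    const int n = 100000;
    for (int i = 0; i < n; i++) {
        double x = d(gen);
        sum += x;
        sum_sq += x * x;
    }
    double mean = sum / n;
    double var = sum_sq / n - mean * mean;  // should come out close to 3
    std::cout << "mean: " << mean << "  var: " << var << std::endl;
    return 0;
}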

The learning procedure is then implemented in C++ against the data generated above. First, the utility functions:

max_id ------ returns the index of the largest element in a vector;
extract_data_from_string ------ parses one line of the generated data file and returns it in a format the learning algorithm can consume;
normal_distribution_probability ------ evaluates the probability density of a normal distribution with the given parameters;
load_data ------ loads the data.
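For reference, the density that normal_distribution_probability evaluates is the univariate normal

$$\Pr(x \mid \mu, \sigma^2) = \frac{1}{\sqrt{2\pi\sigma^2}}\exp\!\left(-\frac{(x-\mu)^2}{2\sigma^2}\right),$$

where the code's var parameter is the variance \(\sigma^2\).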

template<typename T>
int max_id(std::vector<T> &input) {
    assert(input.size() > 0);
    int index = 0;
    T tmp = input[0];
    for (size_t i = 1; i < input.size(); i++) {
        if (input[i] > tmp) {
            tmp = input[i];  // keep the running maximum; without this update,
                             // any element larger than input[0] would win
            index = static_cast<int>(i);
        }
    }
    return index;
}

static pair<double, int> extract_data_from_string(string &s) {
    // Each line has the form "<value> <label>", separated by a single space:
    // everything before the first space is the value, everything after it
    // is the label.
    string num_s, label_s;
    bool trans = false;
    for (auto c : s) {
        if (c != ' ') {
            if (!trans) num_s += c;
            else        label_s += c;
        } else {
            trans = true;
        }
    }
    double num = std::stod(num_s);
    int label = std::stoi(label_s);
    return std::make_pair(num, label);
}

double normal_distribution_probability(double x, double mu, double var) {
    return (1.0 / (sqrt(2.0 * pi) * sqrt(var))) * exp(-(x - mu) * (x - mu) / (2.0 * var));
}

void load_data(vector<std::pair<double, int>> &train_data,
               vector<std::pair<double, int>> &test_data) {
    std::ifstream train_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/train_data.txt");
    std::ifstream test_file("E:/projects/computer_vision_model_learning_inference/computer_vision_model_learning_inference/computer_vision_model_learning_inference/data_set/test_data.txt");
    std::pair<double, int> p;
    std::string s;
    for (int i = 0; i < 1350; i++) {
        std::getline(train_file, s);
        p = extract_data_from_string(s);
        train_data.push_back(p);
    }
    for (int i = 0; i < 150; i++) {
        std::getline(test_file, s);
        p = extract_data_from_string(s);
        test_data.push_back(p);
    }
}
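As a side note, the same parsing could be done with std::istringstream, which tolerates any amount of whitespace between the fields. This is a hypothetical alternative relying on the includes above, not what the post uses:

#include <sstream>

static std::pair<double, int> extract_data_from_string_stream(const std::string &s) {
    std::istringstream iss(s);
    double num;
    int label;
    iss >> num >> label;  // stream extraction skips leading whitespace itself
    return std::make_pair(num, label);
}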

The learning and prediction code is implemented as follows:

void basic_generative_classifier() {
    vector<std::pair<double, int>> train_data, test_data;
    load_data(train_data, test_data);

    // Maximum-likelihood estimates of each label's mean and variance.
    // The training file is laid out label by label:
    // rows [0, 630) are label 1, [630, 900) label 2, [900, 1350) label 3.
    double mu1, var1, mu2, var2, mu3, var3;
    double sum_mu1{}, sum_var1{}, sum_mu2{}, sum_var2{}, sum_mu3{}, sum_var3{};

    // compute the means
    for (int i = 0; i < 630; i++) sum_mu1 += train_data[i].first;
    mu1 = sum_mu1 / 630;
    for (int i = 630; i < 900; i++) sum_mu2 += train_data[i].first;
    mu2 = sum_mu2 / 270;
    for (int i = 900; i < 1350; i++) sum_mu3 += train_data[i].first;
    mu3 = sum_mu3 / 450;

    // compute the variances
    for (int i = 0; i < 630; i++) sum_var1 += (train_data[i].first - mu1) * (train_data[i].first - mu1);
    var1 = sum_var1 / 630;
    for (int i = 630; i < 900; i++) sum_var2 += (train_data[i].first - mu2) * (train_data[i].first - mu2);
    var2 = sum_var2 / 270;
    for (int i = 900; i < 1350; i++) sum_var3 += (train_data[i].first - mu3) * (train_data[i].first - mu3);
    var3 = sum_var3 / 450;

    // Class priors from the label proportions in the training set.
    double lambda1 = 630.0 / 1350.0;
    double lambda2 = 270.0 / 1350.0;
    double lambda3 = 450.0 / 1350.0;

    // Likelihood of every test point under each label's fitted normal.
    vector<vector<double>> likelihoods;
    for (int i = 0; i < 150; i++) {
        double x = test_data[i].first;
        vector<double> l;
        l.push_back(normal_distribution_probability(x, mu1, var1));
        l.push_back(normal_distribution_probability(x, mu2, var2));
        l.push_back(normal_distribution_probability(x, mu3, var3));
        likelihoods.push_back(l);
    }

    // Bayes' rule: posterior over labels, then pick the arg-max.
    vector<int> prediction;
    for (int i = 0; i < 150; i++) {
        double total = likelihoods[i][0] * lambda1
                     + likelihoods[i][1] * lambda2
                     + likelihoods[i][2] * lambda3;
        vector<double> posterior;
        posterior.push_back(likelihoods[i][0] * lambda1 / total);
        posterior.push_back(likelihoods[i][1] * lambda2 / total);
        posterior.push_back(likelihoods[i][2] * lambda3 / total);
        prediction.push_back(max_id<double>(posterior) + 1);
    }

    for (int i = 0; i < 150; i++)
        cout << i + 1 << " : " << prediction[i] << endl;
}
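The prediction loop above is just Bayes' rule, with the fitted normals as class-conditional likelihoods and the label frequencies \(\lambda_k\) as the prior:

$$\Pr(w = k \mid x) = \frac{\lambda_k\,\mathrm{Norm}_x\!\left[\mu_k, \sigma_k^2\right]}{\sum_{j=1}^{3} \lambda_j\,\mathrm{Norm}_x\!\left[\mu_j, \sigma_j^2\right]}.$$

The function only prints the predicted label of each test point; since the test set carries ground truth, a few extra lines would also report accuracy. A minimal sketch, placed at the end of basic_generative_classifier where prediction and test_data are still in scope:

int correct = 0;
for (int i = 0; i < 150; i++) {
    if (prediction[i] == test_data[i].second)  // .second holds the true label
        correct++;
}
cout << "accuracy: " << correct / 150.0 << endl;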

A shortcoming of the code in this post is that the number of labels is hard-coded; for lack of time I did not get to rewrite it as more general code that accepts an arbitrary number of distributions for data generation and training, which is a pity!
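For what it is worth, here is a minimal sketch of one way such a generalization might look, written for this post's vector<pair<double, int>> data layout. The function name fit_class_models, the ClassModel struct, and the use of std::map are my own assumptions, not code from the post:

#include <map>
#include <utility>
#include <vector>

struct ClassModel {
    double mu = 0;      // mean
    double var = 0;     // variance
    double lambda = 0;  // prior
};

// Fit one normal distribution and one prior per label, for however many
// labels actually appear in the training data.
std::map<int, ClassModel> fit_class_models(const std::vector<std::pair<double, int>> &train_data) {
    std::map<int, ClassModel> models;
    std::map<int, int> counts;
    for (const auto &d : train_data) {  // accumulate per-label sums
        models[d.second].mu += d.first;
        counts[d.second]++;
    }
    for (auto &m : models)              // turn sums into means
        m.second.mu /= counts[m.first];
    for (const auto &d : train_data) {  // accumulate squared deviations
        double diff = d.first - models[d.second].mu;
        models[d.second].var += diff * diff;
    }
    for (auto &m : models) {            // deviations -> variances, counts -> priors
        m.second.var /= counts[m.first];
        m.second.lambda = static_cast<double>(counts[m.first]) / train_data.size();
    }
    return models;
}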

A note on the algorithm: the classifier's accuracy is high when the label distributions differ markedly. The reason is simple: this post uses one-dimensional normal data, i.e. a single feature, so once the label distributions overlap substantially, points in the overlapping region can hardly be separated correctly.
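Concretely, between any two labels j and k the decision boundary of this classifier sits where the prior-weighted densities are equal:

$$\lambda_j\,\mathrm{Norm}_x\!\left[\mu_j, \sigma_j^2\right] = \lambda_k\,\mathrm{Norm}_x\!\left[\mu_k, \sigma_k^2\right].$$

Whatever probability mass a class places on the wrong side of this boundary gets misclassified no matter how well the parameters are estimated, so heavily overlapping distributions put a hard ceiling on the attainable accuracy.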
