请找管理员授权。/funnyscript/edit_node_item.php
# -*- coding: utf-8 -*-
"""Optimal binning of one numeric column using CART-style Gini splits.

Command line (script name: cut_by_best_gini.py)::

    python cut_by_best_gini.py <to> <csv_file> <column_count> \
        <min_sample_rate> <col> <col_y>

The CSV file is read without a header row and its columns are named
``c1`` .. ``c<column_count>``.  Rows whose ``col`` value equals -999 are
dropped.  The script prints a trace of the search and finally one line
``result=<comma separated cut points>`` containing the sorted split points
together with the smallest observed value and the largest observed value
plus one.
"""
import copy
import csv
import json
import socket
import sys

import numpy as np
import pandas as pd

# The plotting / scikit-learn imports are not used by the binning logic.
# They are kept for compatibility with the original script, but treated as
# optional so the tool still runs on hosts where they are not installed.
try:
    import matplotlib as mpl

    mpl.use('Agg')
    import matplotlib.pyplot as plt
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc  # ROC / AUC helpers
except ImportError:
    pass

# Lower / upper bin bounds written by calc_min_max().  The names shadow the
# builtins; they are kept because they are part of this module's globals.
# Do not call the builtin min()/max() inside this module.
min = 0
max = 0

# Value that marks a missing observation in the input file.
MISSING_VALUE = -999

# Endpoint for send_msg().  NOTE: pushing the result over TCP is currently
# disabled; the result is only printed to stdout.
TCP_IP = '127.0.0.1'
TCP_PORT = 6000


def calc_min_max(sample_set, var):
    """Record the outer bin bounds of column ``var``.

    Stores the smallest distinct value in the module global ``min`` and the
    largest distinct value plus one in the module global ``max``, printing
    both.

    param sample_set: DataFrame holding the samples
    param var: name of the column to inspect
    return: the ``(min, max)`` pair that was stored
    raises IndexError: if ``sample_set`` has no rows
    """
    global min, max
    distinct = np.unique(sample_set[var])
    min = distinct[0]
    print("min=" + str(min))
    max = distinct[-1] + 1
    print("max=" + str(max))
    return min, max


def calc_score_median(sample_set, var):
    """Return the midpoints between adjacent distinct values of ``var``.

    These midpoints are the candidate thresholds for a binary split.

    param sample_set: DataFrame holding the samples
    param var: name of the column to split on
    return: list of midpoints in ascending order (empty when the column has
            fewer than two distinct values)
    """
    distinct = np.unique(sample_set[var])
    return list((distinct[:-1] + distinct[1:]) / 2)


def _find_best_split(sample_set, col, col_y, min_sample):
    """Search for the threshold with the largest Gini gain.

    return: ``(split_point, position)`` for the best candidate, or ``None``
            when the sample is empty or no candidate leaves at least
            ``min_sample`` rows on both sides with a positive gain.
    """
    score_median_list = calc_score_median(sample_set, col)
    sample_cnt = sample_set.shape[0]
    if sample_cnt == 0:
        print("=================")
        print("=================")
        print(sample_set)
        print(score_median_list)
        # Nothing to split; the Gini formula below would divide by zero.
        return None

    # Pull the two columns out once instead of re-slicing the DataFrame for
    # every candidate threshold.
    values = np.asarray(sample_set[col])
    labels = np.asarray(sample_set[col_y])

    sample1_cnt = labels.sum()
    sample0_cnt = sample_cnt - sample1_cnt
    gini = (1 - np.square(sample1_cnt / sample_cnt)
            - np.square(sample0_cnt / sample_cnt))
    print("Gini=" + str(gini))

    best = None
    best_gain = 0.0
    for position, threshold in enumerate(score_median_list):
        left_mask = values < threshold
        right_mask = values > threshold
        left_cnt = int(left_mask.sum())
        right_cnt = int(right_mask.sum())
        # An empty side cannot be scored; a side below min_sample is not an
        # admissible bin.
        if left_cnt == 0 or right_cnt == 0:
            continue
        if left_cnt < min_sample or right_cnt < min_sample:
            continue

        left1_cnt = labels[left_mask].sum()
        right1_cnt = labels[right_mask].sum()
        left0_cnt = left_cnt - left1_cnt
        right0_cnt = right_cnt - right1_cnt
        left_ratio = left_cnt / sample_cnt
        right_ratio = right_cnt / sample_cnt
        print(str(left_ratio) + ":" + str(right_ratio))
        print("l_c,r_c,min_s===" + str(left_cnt) + ":" + str(right_cnt)
              + ":" + str(min_sample))

        gini_left = (1 - np.square(left1_cnt / left_cnt)
                     - np.square(left0_cnt / left_cnt))
        gini_right = (1 - np.square(right1_cnt / right_cnt)
                      - np.square(right0_cnt / right_cnt))
        gain = gini - (left_ratio * gini_left + right_ratio * gini_right)
        # Strictly greater: on ties the earliest (smallest) threshold wins.
        if gain > best_gain:
            print("Gini_temp==" + str(gain))
            best_gain = gain
            best = (threshold, position)
            print("position=" + str(threshold) + "---" + str(position))
            print(str(gain) + "--" + str(best_gain) + "--" + str(position))
    return best


def choose_best_split(sample_set, col, col_y, min_sample):
    """Pick the best binary split point with the CART Gini criterion.

    param sample_set: DataFrame holding the samples to split
    param col: name of the column to split on
    param col_y: name of the label column (summed to count positives)
    param min_sample: minimum number of rows required on each side
    return: ``(split_point, position)`` where ``position`` is the index of
            the split point in the candidate list.  ``(0.0, 0.0)`` is
            returned when no admissible split exists; because 0.0 can also
            be a genuine split point, internal callers use
            ``_find_best_split`` which reports "no split" unambiguously.
    """
    best = _find_best_split(sample_set, col, col_y, min_sample)
    if best is None:
        return 0.0, 0.0
    return best


def bining_data_split(sample_set, col, col_y, min_sample, split_list):
    """Recursively collect the best split points of ``col``.

    param sample_set: DataFrame holding the samples to split
    param col: name of the column to split on
    param col_y: name of the label column
    param min_sample: minimum number of rows required in each bin
    param split_list: list that receives the split points found (appended
                      in node, left subtree, right subtree order)
    """
    best = _find_best_split(sample_set, col, col_y, min_sample)
    split, position = best if best is not None else (0.0, 0.0)
    print("split===" + str(split) + ",pos=" + str(position))
    if best is None:
        # No admissible split: stop here instead of partitioning the data
        # at the 0.0 placeholder.
        return
    split_list.append(split)

    sample_set_left = sample_set[sample_set[col] < split]
    sample_set_right = sample_set[sample_set[col] > split]
    left_len = len(sample_set_left)
    right_len = len(sample_set_right)

    # A side is split further only if it can hold two minimum-size bins and
    # the opposite side is strictly larger than the minimum size.
    if left_len >= min_sample * 2 and right_len > min_sample:
        print("split left," + str(right_len))
        bining_data_split(sample_set_left, col, col_y, min_sample, split_list)
    if right_len >= min_sample * 2 and left_len > min_sample:
        print("split right," + str(left_len))
        bining_data_split(sample_set_right, col, col_y, min_sample, split_list)


def get_bestsplit_list(sample_set, min_sample_rate, col, col_y):
    """Return the list of best split points for ``col``.

    param sample_set: DataFrame holding the samples
    param min_sample_rate: minimum bin size as a fraction of all rows
                           (for example 0.05)
    param col: name of the column to split on
    param col_y: name of the label column
    return: list of split points in discovery order (not sorted)
    """
    # Minimum number of rows per bin; this is the stopping condition.
    min_df = sample_set.shape[0] * min_sample_rate
    split_list = []
    bining_data_split(sample_set, col, col_y, min_df, split_list)
    return split_list


def send_msg(socket, user, to, msg_type, msg):
    """Send one chat-protocol frame over an already connected socket.

    The frame is ``m:<s>:<json>:</s>`` followed by CRLF, UTF-8 encoded.

    param socket: connected socket-like object providing ``sendall``
                  (the parameter name shadows the ``socket`` module here)
    param user: sender id
    param to: recipient id
    param msg_type: value for the ``type`` field
    param msg: value for the ``message`` field
    """
    # json.dumps escapes quotes, backslashes and control characters; the
    # compact separators keep the frame byte-identical to the previous
    # hand-built string for values that need no escaping.
    payload = json.dumps(
        {
            "return_cmd": "chat_return",
            "from": user,
            "id": "1",
            "oid": "0",
            "to": to,
            "type": msg_type,
            "message": msg,
            "token": "",
        },
        ensure_ascii=False,
        separators=(',', ':'),
    )
    frame = 'm:<s>:' + payload + ':</s>\r\n'
    # sendall keeps writing until the whole frame is out; send may stop early.
    socket.sendall(frame.encode('utf-8'))
    print("send=" + msg)


def main(argv):
    """Run the binning for the given command-line arguments.

    ``argv[1]`` is the recipient id for the (currently disabled) TCP push
    and is therefore not read here.
    """
    if len(argv) <= 6:
        print("参数不足!")
        return

    file = argv[2]
    count = int(argv[3])
    min_sample_rate = float(argv[4])
    col = argv[5]
    col_y = argv[6]
    print("min rate=" + str(min_sample_rate))

    names = ["c" + str(i + 1) for i in range(count)]
    data1 = pd.read_csv(file, header=None, names=names)
    data1 = data1[data1[col] != MISSING_VALUE]
    if data1.empty:
        # Without rows there are no bounds to report; fail with a clear
        # message and a non-zero exit status.
        sys.exit("no rows left in column " + col
                 + " after dropping -999 values")

    lower, upper = calc_min_max(data1, col)
    cut_points = get_bestsplit_list(data1, min_sample_rate, col, col_y)
    print("min=" + str(lower))
    cut_points.append(lower)
    print("max=" + str(upper))
    cut_points.append(upper)
    cut_points.sort()

    line = ",".join(str(x) for x in cut_points)
    print("result=" + line)


if __name__ == "__main__":
    main(sys.argv)
ID=6019 cut_by_best_gini.py 最优分箱
保存
# -*- coding: utf-8 -*-
"""Optimal binning of one numeric column using CART-style Gini splits.

Command line (script name: cut_by_best_gini.py)::

    python cut_by_best_gini.py <to> <csv_file> <column_count> \
        <min_sample_rate> <col> <col_y>

The CSV file is read without a header row and its columns are named
``c1`` .. ``c<column_count>``.  Rows whose ``col`` value equals -999 are
dropped.  The script prints a trace of the search and finally one line
``result=<comma separated cut points>`` containing the sorted split points
together with the smallest observed value and the largest observed value
plus one.
"""
import copy
import csv
import json
import socket
import sys

import numpy as np
import pandas as pd

# The plotting / scikit-learn imports are not used by the binning logic.
# They are kept for compatibility with the original script, but treated as
# optional so the tool still runs on hosts where they are not installed.
try:
    import matplotlib as mpl

    mpl.use('Agg')
    import matplotlib.pyplot as plt
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc  # ROC / AUC helpers
except ImportError:
    pass

# Lower / upper bin bounds written by calc_min_max().  The names shadow the
# builtins; they are kept because they are part of this module's globals.
# Do not call the builtin min()/max() inside this module.
min = 0
max = 0

# Value that marks a missing observation in the input file.
MISSING_VALUE = -999

# Endpoint for send_msg().  NOTE: pushing the result over TCP is currently
# disabled; the result is only printed to stdout.
TCP_IP = '127.0.0.1'
TCP_PORT = 6000


def calc_min_max(sample_set, var):
    """Record the outer bin bounds of column ``var``.

    Stores the smallest distinct value in the module global ``min`` and the
    largest distinct value plus one in the module global ``max``, printing
    both.

    param sample_set: DataFrame holding the samples
    param var: name of the column to inspect
    return: the ``(min, max)`` pair that was stored
    raises IndexError: if ``sample_set`` has no rows
    """
    global min, max
    distinct = np.unique(sample_set[var])
    min = distinct[0]
    print("min=" + str(min))
    max = distinct[-1] + 1
    print("max=" + str(max))
    return min, max


def calc_score_median(sample_set, var):
    """Return the midpoints between adjacent distinct values of ``var``.

    These midpoints are the candidate thresholds for a binary split.

    param sample_set: DataFrame holding the samples
    param var: name of the column to split on
    return: list of midpoints in ascending order (empty when the column has
            fewer than two distinct values)
    """
    distinct = np.unique(sample_set[var])
    return list((distinct[:-1] + distinct[1:]) / 2)


def _find_best_split(sample_set, col, col_y, min_sample):
    """Search for the threshold with the largest Gini gain.

    return: ``(split_point, position)`` for the best candidate, or ``None``
            when the sample is empty or no candidate leaves at least
            ``min_sample`` rows on both sides with a positive gain.
    """
    score_median_list = calc_score_median(sample_set, col)
    sample_cnt = sample_set.shape[0]
    if sample_cnt == 0:
        print("=================")
        print("=================")
        print(sample_set)
        print(score_median_list)
        # Nothing to split; the Gini formula below would divide by zero.
        return None

    # Pull the two columns out once instead of re-slicing the DataFrame for
    # every candidate threshold.
    values = np.asarray(sample_set[col])
    labels = np.asarray(sample_set[col_y])

    sample1_cnt = labels.sum()
    sample0_cnt = sample_cnt - sample1_cnt
    gini = (1 - np.square(sample1_cnt / sample_cnt)
            - np.square(sample0_cnt / sample_cnt))
    print("Gini=" + str(gini))

    best = None
    best_gain = 0.0
    for position, threshold in enumerate(score_median_list):
        left_mask = values < threshold
        right_mask = values > threshold
        left_cnt = int(left_mask.sum())
        right_cnt = int(right_mask.sum())
        # An empty side cannot be scored; a side below min_sample is not an
        # admissible bin.
        if left_cnt == 0 or right_cnt == 0:
            continue
        if left_cnt < min_sample or right_cnt < min_sample:
            continue

        left1_cnt = labels[left_mask].sum()
        right1_cnt = labels[right_mask].sum()
        left0_cnt = left_cnt - left1_cnt
        right0_cnt = right_cnt - right1_cnt
        left_ratio = left_cnt / sample_cnt
        right_ratio = right_cnt / sample_cnt
        print(str(left_ratio) + ":" + str(right_ratio))
        print("l_c,r_c,min_s===" + str(left_cnt) + ":" + str(right_cnt)
              + ":" + str(min_sample))

        gini_left = (1 - np.square(left1_cnt / left_cnt)
                     - np.square(left0_cnt / left_cnt))
        gini_right = (1 - np.square(right1_cnt / right_cnt)
                      - np.square(right0_cnt / right_cnt))
        gain = gini - (left_ratio * gini_left + right_ratio * gini_right)
        # Strictly greater: on ties the earliest (smallest) threshold wins.
        if gain > best_gain:
            print("Gini_temp==" + str(gain))
            best_gain = gain
            best = (threshold, position)
            print("position=" + str(threshold) + "---" + str(position))
            print(str(gain) + "--" + str(best_gain) + "--" + str(position))
    return best


def choose_best_split(sample_set, col, col_y, min_sample):
    """Pick the best binary split point with the CART Gini criterion.

    param sample_set: DataFrame holding the samples to split
    param col: name of the column to split on
    param col_y: name of the label column (summed to count positives)
    param min_sample: minimum number of rows required on each side
    return: ``(split_point, position)`` where ``position`` is the index of
            the split point in the candidate list.  ``(0.0, 0.0)`` is
            returned when no admissible split exists; because 0.0 can also
            be a genuine split point, internal callers use
            ``_find_best_split`` which reports "no split" unambiguously.
    """
    best = _find_best_split(sample_set, col, col_y, min_sample)
    if best is None:
        return 0.0, 0.0
    return best


def bining_data_split(sample_set, col, col_y, min_sample, split_list):
    """Recursively collect the best split points of ``col``.

    param sample_set: DataFrame holding the samples to split
    param col: name of the column to split on
    param col_y: name of the label column
    param min_sample: minimum number of rows required in each bin
    param split_list: list that receives the split points found (appended
                      in node, left subtree, right subtree order)
    """
    best = _find_best_split(sample_set, col, col_y, min_sample)
    split, position = best if best is not None else (0.0, 0.0)
    print("split===" + str(split) + ",pos=" + str(position))
    if best is None:
        # No admissible split: stop here instead of partitioning the data
        # at the 0.0 placeholder.
        return
    split_list.append(split)

    sample_set_left = sample_set[sample_set[col] < split]
    sample_set_right = sample_set[sample_set[col] > split]
    left_len = len(sample_set_left)
    right_len = len(sample_set_right)

    # A side is split further only if it can hold two minimum-size bins and
    # the opposite side is strictly larger than the minimum size.
    if left_len >= min_sample * 2 and right_len > min_sample:
        print("split left," + str(right_len))
        bining_data_split(sample_set_left, col, col_y, min_sample, split_list)
    if right_len >= min_sample * 2 and left_len > min_sample:
        print("split right," + str(left_len))
        bining_data_split(sample_set_right, col, col_y, min_sample, split_list)


def get_bestsplit_list(sample_set, min_sample_rate, col, col_y):
    """Return the list of best split points for ``col``.

    param sample_set: DataFrame holding the samples
    param min_sample_rate: minimum bin size as a fraction of all rows
                           (for example 0.05)
    param col: name of the column to split on
    param col_y: name of the label column
    return: list of split points in discovery order (not sorted)
    """
    # Minimum number of rows per bin; this is the stopping condition.
    min_df = sample_set.shape[0] * min_sample_rate
    split_list = []
    bining_data_split(sample_set, col, col_y, min_df, split_list)
    return split_list


def send_msg(socket, user, to, msg_type, msg):
    """Send one chat-protocol frame over an already connected socket.

    The frame is ``m:<s>:<json>:</s>`` followed by CRLF, UTF-8 encoded.

    param socket: connected socket-like object providing ``sendall``
                  (the parameter name shadows the ``socket`` module here)
    param user: sender id
    param to: recipient id
    param msg_type: value for the ``type`` field
    param msg: value for the ``message`` field
    """
    # json.dumps escapes quotes, backslashes and control characters; the
    # compact separators keep the frame byte-identical to the previous
    # hand-built string for values that need no escaping.
    payload = json.dumps(
        {
            "return_cmd": "chat_return",
            "from": user,
            "id": "1",
            "oid": "0",
            "to": to,
            "type": msg_type,
            "message": msg,
            "token": "",
        },
        ensure_ascii=False,
        separators=(',', ':'),
    )
    frame = 'm:<s>:' + payload + ':</s>\r\n'
    # sendall keeps writing until the whole frame is out; send may stop early.
    socket.sendall(frame.encode('utf-8'))
    print("send=" + msg)


def main(argv):
    """Run the binning for the given command-line arguments.

    ``argv[1]`` is the recipient id for the (currently disabled) TCP push
    and is therefore not read here.
    """
    if len(argv) <= 6:
        print("参数不足!")
        return

    file = argv[2]
    count = int(argv[3])
    min_sample_rate = float(argv[4])
    col = argv[5]
    col_y = argv[6]
    print("min rate=" + str(min_sample_rate))

    names = ["c" + str(i + 1) for i in range(count)]
    data1 = pd.read_csv(file, header=None, names=names)
    data1 = data1[data1[col] != MISSING_VALUE]
    if data1.empty:
        # Without rows there are no bounds to report; fail with a clear
        # message and a non-zero exit status.
        sys.exit("no rows left in column " + col
                 + " after dropping -999 values")

    lower, upper = calc_min_max(data1, col)
    cut_points = get_bestsplit_list(data1, min_sample_rate, col, col_y)
    print("min=" + str(lower))
    cut_points.append(lower)
    print("max=" + str(upper))
    cut_points.append(upper)
    cut_points.sort()

    line = ",".join(str(x) for x in cut_points)
    print("result=" + line)


if __name__ == "__main__":
    main(sys.argv)