请找管理员授权。/funnyscript/edit_node_item.php
# -*- coding: utf-8 -*-
"""Chi-square based binning of one feature against a 0/1 target.

Command line (eight ``sys.argv`` entries including the script name):

    <script> <to> <csv_file> <n_columns> <max_bins> <feature_col> <target_col> <rate>

The CSV is read without a header; its columns are named ``c1`` .. ``c<n_columns>``.
The computed cut points are printed as ``result=<comma separated values>``.
"""
import copy
import csv
import json
import socket
import sys

import numpy as np
import pandas as pd

# None of the plotting / scikit-learn names are used by the binning code in
# this script.  The imports are kept for compatibility but made optional so the
# script still runs on hosts where these packages are not installed.
try:
    import matplotlib as mpl
    mpl.use('Agg')  # non-interactive backend; must be selected before pyplot
    import matplotlib.pyplot as plt
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc  # ROC / AUC helpers
except ImportError:
    pass

# Endpoint of the notification service that send_msg() is meant to talk to.
# The script itself does not open a connection.
TCP_IP = '127.0.0.1'
TCP_PORT = 6000


def Chi2(df, total_col, good_col, overallRate):
    """Return the chi-square statistic of ``df`` against ``overallRate``.

    For every row the expected count is ``total * overallRate`` and the
    observed count is ``total - good``; the statistic is the sum of
    ``(expected - observed) ** 2 / expected`` over all rows.

    :param df: DataFrame with one row per distinct feature value
    :param total_col: name of the column holding the row count per value
    :param good_col: name of the column holding the sum of the target per value
    :param overallRate: overall share of rows that are NOT counted in
        ``good_col`` (the callers pass ``(N - G) / N``)
    :return: the chi-square value (0 for an empty frame)
    """
    expected = df[total_col] * overallRate
    observed = df[total_col] - df[good_col]
    # Rows with an expected count of 0 contribute nothing; skipping them
    # avoids a ZeroDivisionError when overallRate is 0 or a group is empty.
    return sum((e - o) ** 2 / e for e, o in zip(expected, observed) if e != 0)


def _distinct_sorted(df, col):
    """Return the sorted distinct non-missing values of ``df[col]``."""
    # NaN is dropped on purpose: groupby ignores NaN keys, and set() would
    # keep every NaN as a separate element because NaN != NaN.
    return sorted(df[col].dropna().unique().tolist())


def _group_counts(df, col, target):
    """Aggregate ``target`` per value of ``col``.

    :return: ``(regroup, overallRate)`` where ``regroup`` has the columns
        ``col``, ``total`` (row count) and ``good`` (sum of the target), and
        ``overallRate`` is ``(N - G) / N`` over the whole frame
        (0.0 when the frame is empty).
    """
    grouped = df.groupby(col)[target]
    regroup = pd.DataFrame({'total': grouped.count(), 'good': grouped.sum()})
    regroup = regroup.reset_index()
    N = sum(regroup['total'])
    G = sum(regroup['good'])
    overallRate = (N - G) * 1.0 / N if N else 0.0
    return regroup, overallRate


def _interval_chi2(regroup, col, interval, overallRate):
    """Return Chi2 over the rows of ``regroup`` whose value lies in ``interval``."""
    rows = regroup.loc[regroup[col].isin(interval)]
    return Chi2(rows, 'total', 'good', overallRate)


def _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate):
    """Merge the bin with the smallest chi-square into a neighbour, in place.

    The neighbour is the only one available at either end; in the middle it is
    the adjacent bin with the smaller chi-square (the left one on ties).
    ``chisqList`` is kept in sync: only the merged bin is recomputed, because
    the statistic of a bin does not depend on the other bins.
    """
    groupNum = len(groupIntervals)
    min_position = chisqList.index(min(chisqList))
    if min_position == 0:
        combinedPosition = 1
    elif min_position == groupNum - 1:
        combinedPosition = min_position - 1
    elif chisqList[min_position - 1] <= chisqList[min_position + 1]:
        combinedPosition = min_position - 1
    else:
        combinedPosition = min_position + 1

    merged = groupIntervals[min_position] + groupIntervals[combinedPosition]
    groupIntervals[min_position] = merged
    chisqList[min_position] = _interval_chi2(regroup, col, merged, overallRate)
    del groupIntervals[combinedPosition]
    del chisqList[combinedPosition]


# Threshold-driven binning.  Drawback: the number of bins cannot be controlled.
def ChiMerge_MinChisq(df, col, target, confidenceVal=3.841):
    """Merge bins until the smallest chi-square reaches ``confidenceVal``.

    :param df: input DataFrame
    :param col: name of the feature to bin
    :param target: name of the target column; must hold 0/1 values, since the
        per-value sum of this column is used as a count
    :param confidenceVal: stop threshold; 3.841 is the chi-square critical
        value for one degree of freedom at the 0.95 confidence level
    :return: list of bins, each a sorted list of the feature values it holds
        (``[]`` for an empty frame).  The number of bins is not bounded, so
        the result can be fine-grained.
    """
    colLevels = _distinct_sorted(df, col)
    if not colLevels:
        return []

    regroup, overallRate = _group_counts(df, col, target)
    groupIntervals = [[i] for i in colLevels]
    chisqList = [_interval_chi2(regroup, col, interval, overallRate)
                 for interval in groupIntervals]

    while len(groupIntervals) > 1:
        var_min = min(chisqList)
        print("var_min"+str(var_min))
        if var_min >= confidenceVal:
            break
        _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate)

    # A merge with the left neighbour appends smaller values after larger
    # ones, so order every bin before handing it back.
    return [sorted(i) for i in groupIntervals]


# Binning with an upper bound on the number of bins.
def ChiMerge_MaxInterval_Original(df, col, target, max_interval, rate):
    """Merge bins until at most ``max_interval`` bins remain.

    :param df: input DataFrame
    :param col: name of the feature to bin
    :param target: name of the target column (0/1 values)
    :param max_interval: maximum number of bins; values below 1 are treated
        as 1
    :param rate: fraction of the row count; only used to compute and print
        ``min_count`` (no minimum-bin-size check is applied)
    :return: when merging was needed, the smallest value of every bin followed
        by the largest value of the last bin; otherwise the sorted distinct
        values of ``col``
    """
    print("len(df.index)")
    print(str(len(df.index)))
    print("rate")
    print(rate)
    min_count = len(df.index) * rate
    print("min_count")
    print(min_count)

    colLevels = _distinct_sorted(df, col)
    N_distinct = len(colLevels)
    if not colLevels or N_distinct <= max_interval:
        print("the row is cann't be less than interval numbers N_distinct="+str(N_distinct))
        # NOTE(review): unlike the merge path below, this does not repeat the
        # maximum as a closing edge - confirm that callers expect this shape.
        return colLevels

    regroup, overallRate = _group_counts(df, col, target)
    groupIntervals = [[i] for i in colLevels]
    chisqList = [_interval_chi2(regroup, col, interval, overallRate)
                 for interval in groupIntervals]
    groupNum = len(groupIntervals)

    # A single bin has no neighbour to merge with, so never go below one bin.
    target_bins = max(max_interval, 1)
    while groupNum > target_bins:
        print("groupNum:"+str(groupNum))
        _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate)
        groupNum = len(groupIntervals)

    groupIntervals = [sorted(i) for i in groupIntervals]
    print("count="+str(groupNum))
    print(groupIntervals)
    cutOffPoints1 = [i[0] for i in groupIntervals]
    cutOffPoints2 = [groupIntervals[-1][-1]]
    print(cutOffPoints1)
    print(cutOffPoints2)
    return cutOffPoints1 + cutOffPoints2


def send_msg(socket, user, to, msg_type, msg):
    """Send one chat-style message frame over a connected socket.

    The frame is ``m:<s>:{...json...}:</s>`` followed by CRLF, UTF-8 encoded.

    :param socket: connected socket object (the parameter name shadows the
        ``socket`` module inside this function)
    :param user: sender id, placed in the ``from`` field
    :param to: recipient id
    :param msg_type: value of the ``type`` field
    :param msg: message text
    """
    fields = [
        ("return_cmd", "chat_return"),
        ("from", user),
        ("id", "1"),
        ("oid", "0"),
        ("to", to),
        ("type", msg_type),
        ("message", msg),
        ("token", ""),
    ]
    # Encode each value with json.dumps so quotes, backslashes and line breaks
    # are escaped; for plain text the bytes are the same as a hand-built frame.
    body = ",".join(
        json.dumps(key, ensure_ascii=False) + ":" + json.dumps(value, ensure_ascii=False)
        for key, value in fields
    )
    MESSAGE = 'm:<s>:{' + body + '}:</s>\r\n'
    # sendall() keeps writing until the whole frame is out; send() may stop early.
    socket.sendall(MESSAGE.encode('utf-8'))
    print("send="+msg)


def main(argv=None):
    """Run the binning from command-line arguments and print the cut points.

    :param argv: argument list including the program name; defaults to
        ``sys.argv``
    """
    argv = sys.argv if argv is None else argv
    # Index 7 (rate) is read below, so eight entries are required.
    if len(argv) <= 7:
        print("参数不足!")
        return

    # argv[1] is the recipient id for send_msg(); it is not used here because
    # no notification is sent.
    csv_path = argv[2]
    count = int(argv[3])
    max_col = int(argv[4])
    col = argv[5]
    col_y = argv[6]
    rate = float(argv[7])

    names = ["c" + str(i + 1) for i in range(count)]
    data1 = pd.read_csv(csv_path, header=None, names=names)
    # Rows whose feature equals -999 are excluded (presumably a missing-value
    # marker - confirm with the data producer).
    data1 = data1[data1[col] != -999]

    b = ChiMerge_MaxInterval_Original(data1, col, col_y, max_col, rate)
    if b:
        # Push the last edge one past the maximum value.
        b[-1] = b[-1] + 1
    line = ",".join(str(x) for x in b)
    print("result=" + line)


if __name__ == "__main__":
    main()
ID=6018 cut_by_chi.py 卡方分箱
保存
# -*- coding: utf-8 -*-
"""Chi-square based binning of one feature against a 0/1 target.

Command line (eight ``sys.argv`` entries including the script name):

    <script> <to> <csv_file> <n_columns> <max_bins> <feature_col> <target_col> <rate>

The CSV is read without a header; its columns are named ``c1`` .. ``c<n_columns>``.
The computed cut points are printed as ``result=<comma separated values>``.
"""
import copy
import csv
import json
import socket
import sys

import numpy as np
import pandas as pd

# None of the plotting / scikit-learn names are used by the binning code in
# this script.  The imports are kept for compatibility but made optional so the
# script still runs on hosts where these packages are not installed.
try:
    import matplotlib as mpl
    mpl.use('Agg')  # non-interactive backend; must be selected before pyplot
    import matplotlib.pyplot as plt
    from sklearn import svm, datasets
    from sklearn.metrics import roc_curve, auc  # ROC / AUC helpers
except ImportError:
    pass

# Endpoint of the notification service that send_msg() is meant to talk to.
# The script itself does not open a connection.
TCP_IP = '127.0.0.1'
TCP_PORT = 6000


def Chi2(df, total_col, good_col, overallRate):
    """Return the chi-square statistic of ``df`` against ``overallRate``.

    For every row the expected count is ``total * overallRate`` and the
    observed count is ``total - good``; the statistic is the sum of
    ``(expected - observed) ** 2 / expected`` over all rows.

    :param df: DataFrame with one row per distinct feature value
    :param total_col: name of the column holding the row count per value
    :param good_col: name of the column holding the sum of the target per value
    :param overallRate: overall share of rows that are NOT counted in
        ``good_col`` (the callers pass ``(N - G) / N``)
    :return: the chi-square value (0 for an empty frame)
    """
    expected = df[total_col] * overallRate
    observed = df[total_col] - df[good_col]
    # Rows with an expected count of 0 contribute nothing; skipping them
    # avoids a ZeroDivisionError when overallRate is 0 or a group is empty.
    return sum((e - o) ** 2 / e for e, o in zip(expected, observed) if e != 0)


def _distinct_sorted(df, col):
    """Return the sorted distinct non-missing values of ``df[col]``."""
    # NaN is dropped on purpose: groupby ignores NaN keys, and set() would
    # keep every NaN as a separate element because NaN != NaN.
    return sorted(df[col].dropna().unique().tolist())


def _group_counts(df, col, target):
    """Aggregate ``target`` per value of ``col``.

    :return: ``(regroup, overallRate)`` where ``regroup`` has the columns
        ``col``, ``total`` (row count) and ``good`` (sum of the target), and
        ``overallRate`` is ``(N - G) / N`` over the whole frame
        (0.0 when the frame is empty).
    """
    grouped = df.groupby(col)[target]
    regroup = pd.DataFrame({'total': grouped.count(), 'good': grouped.sum()})
    regroup = regroup.reset_index()
    N = sum(regroup['total'])
    G = sum(regroup['good'])
    overallRate = (N - G) * 1.0 / N if N else 0.0
    return regroup, overallRate


def _interval_chi2(regroup, col, interval, overallRate):
    """Return Chi2 over the rows of ``regroup`` whose value lies in ``interval``."""
    rows = regroup.loc[regroup[col].isin(interval)]
    return Chi2(rows, 'total', 'good', overallRate)


def _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate):
    """Merge the bin with the smallest chi-square into a neighbour, in place.

    The neighbour is the only one available at either end; in the middle it is
    the adjacent bin with the smaller chi-square (the left one on ties).
    ``chisqList`` is kept in sync: only the merged bin is recomputed, because
    the statistic of a bin does not depend on the other bins.
    """
    groupNum = len(groupIntervals)
    min_position = chisqList.index(min(chisqList))
    if min_position == 0:
        combinedPosition = 1
    elif min_position == groupNum - 1:
        combinedPosition = min_position - 1
    elif chisqList[min_position - 1] <= chisqList[min_position + 1]:
        combinedPosition = min_position - 1
    else:
        combinedPosition = min_position + 1

    merged = groupIntervals[min_position] + groupIntervals[combinedPosition]
    groupIntervals[min_position] = merged
    chisqList[min_position] = _interval_chi2(regroup, col, merged, overallRate)
    del groupIntervals[combinedPosition]
    del chisqList[combinedPosition]


# Threshold-driven binning.  Drawback: the number of bins cannot be controlled.
def ChiMerge_MinChisq(df, col, target, confidenceVal=3.841):
    """Merge bins until the smallest chi-square reaches ``confidenceVal``.

    :param df: input DataFrame
    :param col: name of the feature to bin
    :param target: name of the target column; must hold 0/1 values, since the
        per-value sum of this column is used as a count
    :param confidenceVal: stop threshold; 3.841 is the chi-square critical
        value for one degree of freedom at the 0.95 confidence level
    :return: list of bins, each a sorted list of the feature values it holds
        (``[]`` for an empty frame).  The number of bins is not bounded, so
        the result can be fine-grained.
    """
    colLevels = _distinct_sorted(df, col)
    if not colLevels:
        return []

    regroup, overallRate = _group_counts(df, col, target)
    groupIntervals = [[i] for i in colLevels]
    chisqList = [_interval_chi2(regroup, col, interval, overallRate)
                 for interval in groupIntervals]

    while len(groupIntervals) > 1:
        var_min = min(chisqList)
        print("var_min"+str(var_min))
        if var_min >= confidenceVal:
            break
        _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate)

    # A merge with the left neighbour appends smaller values after larger
    # ones, so order every bin before handing it back.
    return [sorted(i) for i in groupIntervals]


# Binning with an upper bound on the number of bins.
def ChiMerge_MaxInterval_Original(df, col, target, max_interval, rate):
    """Merge bins until at most ``max_interval`` bins remain.

    :param df: input DataFrame
    :param col: name of the feature to bin
    :param target: name of the target column (0/1 values)
    :param max_interval: maximum number of bins; values below 1 are treated
        as 1
    :param rate: fraction of the row count; only used to compute and print
        ``min_count`` (no minimum-bin-size check is applied)
    :return: when merging was needed, the smallest value of every bin followed
        by the largest value of the last bin; otherwise the sorted distinct
        values of ``col``
    """
    print("len(df.index)")
    print(str(len(df.index)))
    print("rate")
    print(rate)
    min_count = len(df.index) * rate
    print("min_count")
    print(min_count)

    colLevels = _distinct_sorted(df, col)
    N_distinct = len(colLevels)
    if not colLevels or N_distinct <= max_interval:
        print("the row is cann't be less than interval numbers N_distinct="+str(N_distinct))
        # NOTE(review): unlike the merge path below, this does not repeat the
        # maximum as a closing edge - confirm that callers expect this shape.
        return colLevels

    regroup, overallRate = _group_counts(df, col, target)
    groupIntervals = [[i] for i in colLevels]
    chisqList = [_interval_chi2(regroup, col, interval, overallRate)
                 for interval in groupIntervals]
    groupNum = len(groupIntervals)

    # A single bin has no neighbour to merge with, so never go below one bin.
    target_bins = max(max_interval, 1)
    while groupNum > target_bins:
        print("groupNum:"+str(groupNum))
        _merge_lowest_chi2(groupIntervals, chisqList, regroup, col, overallRate)
        groupNum = len(groupIntervals)

    groupIntervals = [sorted(i) for i in groupIntervals]
    print("count="+str(groupNum))
    print(groupIntervals)
    cutOffPoints1 = [i[0] for i in groupIntervals]
    cutOffPoints2 = [groupIntervals[-1][-1]]
    print(cutOffPoints1)
    print(cutOffPoints2)
    return cutOffPoints1 + cutOffPoints2


def send_msg(socket, user, to, msg_type, msg):
    """Send one chat-style message frame over a connected socket.

    The frame is ``m:<s>:{...json...}:</s>`` followed by CRLF, UTF-8 encoded.

    :param socket: connected socket object (the parameter name shadows the
        ``socket`` module inside this function)
    :param user: sender id, placed in the ``from`` field
    :param to: recipient id
    :param msg_type: value of the ``type`` field
    :param msg: message text
    """
    fields = [
        ("return_cmd", "chat_return"),
        ("from", user),
        ("id", "1"),
        ("oid", "0"),
        ("to", to),
        ("type", msg_type),
        ("message", msg),
        ("token", ""),
    ]
    # Encode each value with json.dumps so quotes, backslashes and line breaks
    # are escaped; for plain text the bytes are the same as a hand-built frame.
    body = ",".join(
        json.dumps(key, ensure_ascii=False) + ":" + json.dumps(value, ensure_ascii=False)
        for key, value in fields
    )
    MESSAGE = 'm:<s>:{' + body + '}:</s>\r\n'
    # sendall() keeps writing until the whole frame is out; send() may stop early.
    socket.sendall(MESSAGE.encode('utf-8'))
    print("send="+msg)


def main(argv=None):
    """Run the binning from command-line arguments and print the cut points.

    :param argv: argument list including the program name; defaults to
        ``sys.argv``
    """
    argv = sys.argv if argv is None else argv
    # Index 7 (rate) is read below, so eight entries are required.
    if len(argv) <= 7:
        print("参数不足!")
        return

    # argv[1] is the recipient id for send_msg(); it is not used here because
    # no notification is sent.
    csv_path = argv[2]
    count = int(argv[3])
    max_col = int(argv[4])
    col = argv[5]
    col_y = argv[6]
    rate = float(argv[7])

    names = ["c" + str(i + 1) for i in range(count)]
    data1 = pd.read_csv(csv_path, header=None, names=names)
    # Rows whose feature equals -999 are excluded (presumably a missing-value
    # marker - confirm with the data producer).
    data1 = data1[data1[col] != -999]

    b = ChiMerge_MaxInterval_Original(data1, col, col_y, max_col, rate)
    if b:
        # Push the last edge one past the maximum value.
        b[-1] = b[-1] + 1
    line = ",".join(str(x) for x in b)
    print("result=" + line)


if __name__ == "__main__":
    main()