python - sentiment - Construyendo una matriz de co-ocurrencia en los pandas de pitón

tweepy sentiment analysis (3)

Demostración en NumPy:

import numpy as np np.random.seed(3) # for reproducibility # Generate data: 5 labels, 10 examples, binary. label_headers = ''Alice Bob Carol Dave Eve''.split('' '') label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer. print(''labels:/n{0}''.format(label_data)) # Compute cooccurrence matrix cooccurrence_matrix = np.dot(label_data.transpose(),label_data) print(''/ncooccurrence_matrix:/n{0}''.format(cooccurrence_matrix)) # Compute cooccurrence matrix in percentage # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element # http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804 cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix) with np.errstate(divide=''ignore'', invalid=''ignore''): cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None])) print(''/ncooccurrence_matrix_percentage:/n{0}''.format(cooccurrence_matrix_percentage))

Salida:

labels: [[0 0 1 1 0] [0 0 1 1 1] [0 1 1 1 0] [1 1 0 0 0] [0 1 1 0 0] [0 1 0 0 0] [0 1 0 1 1] [0 1 0 0 1] [1 0 0 1 0] [1 0 1 1 1]] cooccurrence_matrix: [[3 1 1 2 1] [1 6 2 2 2] [1 2 5 4 2] [2 2 4 6 3] [1 2 2 3 4]] cooccurrence_matrix_percentage: [[ 1. 0.33333333 0.33333333 0.66666667 0.33333333] [ 0.16666667 1. 0.33333333 0.33333333 0.33333333] [ 0.2 0.4 1. 0.8 0.4 ] [ 0.33333333 0.33333333 0.66666667 1. 0.5 ] [ 0.25 0.5 0.5 0.75 1. ]]

Con un mapa de calor usando matplotlib:

import numpy as np np.random.seed(3) # for reproducibility import matplotlib.pyplot as plt def show_values(pc, fmt="%.2f", **kw): '''''' Heatmap with text in each cell with matplotlib''s pyplot Source: http://stackoverflow.com/a/25074150/395857 By HYRY '''''' from itertools import izip pc.update_scalarmappable() ax = pc.get_axes() for p, color, value in izip(pc.get_paths(), pc.get_facecolors(), pc.get_array()): x, y = p.vertices[:-2, :].mean(0) if np.all(color[:3] > 0.5): color = (0.0, 0.0, 0.0) else: color = (1.0, 1.0, 1.0) ax.text(x, y, fmt % value, ha="center", va="center", color=color, **kw) def cm2inch(*tupl): '''''' Specify figure size in centimeter in matplotlib Source: http://stackoverflow.com/a/22787457/395857 By gns-ank '''''' inch = 2.54 if type(tupl[0]) == tuple: return tuple(i/inch for i in tupl[0]) else: return tuple(i/inch for i in tupl) def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels): '''''' Inspired by: - http://stackoverflow.com/a/16124677/395857 - http://stackoverflow.com/a/25074150/395857 '''''' # Plot it out fig, ax = plt.subplots() c = ax.pcolor(AUC, edgecolors=''k'', linestyle= ''dashed'', linewidths=0.2, cmap=''RdBu'', vmin=0.0, vmax=1.0) # put the major ticks at the middle of each cell ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False) ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False) # set tick labels #ax.set_xticklabels(np.arange(1,AUC.shape[1]+1), minor=False) ax.set_xticklabels(xticklabels, minor=False) ax.set_yticklabels(yticklabels, minor=False) # set title and x/y labels plt.title(title) plt.xlabel(xlabel) plt.ylabel(ylabel) # Remove last blank column plt.xlim( (0, AUC.shape[1]) ) # Turn off all the ticks ax = plt.gca() for t in ax.xaxis.get_major_ticks(): t.tick1On = False t.tick2On = False for t in ax.yaxis.get_major_ticks(): t.tick1On = False t.tick2On = False # Add color bar plt.colorbar(c) # Add text in each cell show_values(c) # Proper orientation (origin at the top left instead of bottom left) ax.invert_yaxis() ax.xaxis.tick_top() # resize fig = plt.gcf() fig.set_size_inches(cm2inch(40, 20)) def main(): # Generate data: 5 labels, 10 examples, binary. label_headers = ''Alice Bob Carol Dave Eve''.split('' '') label_data = np.random.randint(0,2,(10,5)) # binary here but could be any integer. print(''labels:/n{0}''.format(label_data)) # Compute cooccurrence matrix cooccurrence_matrix = np.dot(label_data.transpose(),label_data) print(''/ncooccurrence_matrix:/n{0}''.format(cooccurrence_matrix)) # Compute cooccurrence matrix in percentage # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element # http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804 cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix) with np.errstate(divide=''ignore'', invalid=''ignore''): cooccurrence_matrix_percentage = np.nan_to_num(np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None])) print(''/ncooccurrence_matrix_percentage:/n{0}''.format(cooccurrence_matrix_percentage)) # Add count in labels label_header_with_count = [ ''{0} ({1})''.format(label_header, cooccurrence_matrix_diagonal[label_number]) for label_number, label_header in enumerate(label_headers)] print(''/nlabel_header_with_count: {0}''.format(label_header_with_count)) # Plotting x_axis_size = cooccurrence_matrix_percentage.shape[0] y_axis_size = cooccurrence_matrix_percentage.shape[1] title = "Co-occurrence matrix/n" xlabel= ''''#"Labels" ylabel= ''''#"Labels" xticklabels = label_header_with_count yticklabels = label_header_with_count heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels) plt.savefig(''image_output.png'', dpi=300, format=''png'', bbox_inches=''tight'') # use format=''svg'' or ''pdf'' for vectorial pictures #plt.show() if __name__ == "__main__": main() #cProfile.run(''main()'') # if you want to do some profiling

(PD: una visualización ordenada de una matriz de co-ocurrencia en D3.js )

Sé cómo hacer esto en R Pero, ¿hay alguna función en los pandas que transforme un marco de datos en una matriz de coexistencia de nxn que contenga los recuentos de dos aspectos simultáneos?

Por ejemplo, una matriz df:

import pandas as pd df = pd.DataFrame({''TFD'' : [''AA'', ''SL'', ''BB'', ''D0'', ''Dk'', ''FF''], ''Snack'' : [''1'', ''0'', ''1'', ''1'', ''0'', ''0''], ''Trans'' : [''1'', ''1'', ''1'', ''0'', ''0'', ''1''], ''Dop'' : [''1'', ''0'', ''1'', ''0'', ''1'', ''1'']}).set_index(''TFD'') print df >>> Dop Snack Trans TFD AA 1 1 1 SL 0 0 1 BB 1 1 1 D0 0 1 0 Dk 1 0 0 FF 1 0 1 [6 rows x 3 columns]

cedería:

Dop Snack Trans Dop 0 2 3 Snack 2 0 2 Trans 3 2 0

Dado que la matriz se refleja en la diagonal, supongo que habría una forma de optimizar el código.

En caso de que tenga un corpus más grande y una matriz de frecuencia de términos, usar la multiplicación de matriz dispersa podría ser más eficiente. Utilizo el mismo truco de multiplicación de matrices que se refiere a algo en esta página.

import scipy.sparse as sp X = sp.csr_matrix(df.astype(int).values) # convert dataframe to sparse matrix Xc = X.T * X # multiply sparse matrix # Xc.setdiag(0) # reset diagonal print(Xc.todense()) # to print co-occurence matrix in dense format

Xc aquí será la matriz de co-ocurrencia en formato csr escaso

Es un álgebra lineal simple, multiplicas la matriz con su transposición (tu ejemplo contiene cadenas, no olvides convertirlas a números enteros):

>>> df_asint = df.astype(int) >>> coocc = df_asint.T.dot(df_asint) >>> coocc Dop Snack Trans Dop 4 2 3 Snack 2 3 2 Trans 3 2 4

si, como en la respuesta R, quieres restablecer la diagonal, puedes usar fill_diagonal de fill_diagonal :

>>> import numpy as np >>> np.fill_diagonal(coocc.values, 0) >>> coocc Dop Snack Trans Dop 0 2 3 Snack 2 0 2 Trans 3 2 0