python - sentiment - Construyendo una matriz de co-ocurrencia en pandas (Python)
tweepy sentiment analysis (3)
Demostración en NumPy:
import numpy as np

np.random.seed(3)  # fixed seed so the demo output is reproducible

# Generate data: 5 labels, 10 examples, binary.
label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
label_data = np.random.randint(0, 2, (10, 5))  # binary here but could be any integer.
print('labels:\n{0}'.format(label_data))

# Compute cooccurrence matrix: entry (i, j) is the dot product of label
# columns i and j, i.e. for binary data the number of examples carrying both.
cooccurrence_matrix = np.dot(label_data.transpose(), label_data)
print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))

# Compute cooccurrence matrix in percentage: each row i is divided by the
# diagonal entry (i, i), which for binary data is the count of label i.
# FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
#      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
# A zero on the diagonal makes the division produce NaN (0/0); the warnings
# are silenced and nan_to_num turns those NaN entries into 0.
with np.errstate(divide='ignore', invalid='ignore'):
    cooccurrence_matrix_percentage = np.nan_to_num(
        np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))
Salida:
labels:
[[0 0 1 1 0]
[0 0 1 1 1]
[0 1 1 1 0]
[1 1 0 0 0]
[0 1 1 0 0]
[0 1 0 0 0]
[0 1 0 1 1]
[0 1 0 0 1]
[1 0 0 1 0]
[1 0 1 1 1]]
cooccurrence_matrix:
[[3 1 1 2 1]
[1 6 2 2 2]
[1 2 5 4 2]
[2 2 4 6 3]
[1 2 2 3 4]]
cooccurrence_matrix_percentage:
[[ 1. 0.33333333 0.33333333 0.66666667 0.33333333]
[ 0.16666667 1. 0.33333333 0.33333333 0.33333333]
[ 0.2 0.4 1. 0.8 0.4 ]
[ 0.33333333 0.33333333 0.66666667 1. 0.5 ]
[ 0.25 0.5 0.5 0.75 1. ]]
Con un mapa de calor usando matplotlib:
import numpy as np
np.random.seed(3) # for reproducibility
import matplotlib.pyplot as plt
def show_values(pc, fmt="%.2f", **kw):
    '''
    Heatmap with text in each cell with matplotlib's pyplot.

    Writes the data value of every cell of the collection `pc` as text
    centered in that cell, in black on light cells and white on dark cells.

    pc   -- collection to annotate; it must provide update_scalarmappable(),
            get_paths(), get_facecolors(), get_array() and an `axes`
            attribute (heatmap() passes the result of ax.pcolor here).
    fmt  -- %-style format applied to each cell value.
    **kw -- extra keyword arguments forwarded to ax.text().

    Source: http://stackoverflow.com/a/25074150/395857
    By HYRY
    '''
    # Refresh the face colors so they reflect the current data and colormap.
    pc.update_scalarmappable()
    ax = pc.axes
    # ravel() is a no-op on a 1-D array.
    # NOTE(review): newer matplotlib releases appear to keep the pcolor array
    # 2-D, in which case flattening is required to pair values with paths --
    # confirm against the matplotlib version in use.
    values = np.ravel(pc.get_array())
    for p, color, value in zip(pc.get_paths(), pc.get_facecolors(), values):
        # Cell center: mean of the path vertices, leaving out the last two.
        x, y = p.vertices[:-2, :].mean(0)
        # Light background (all RGB channels > 0.5) -> black text, else white.
        if np.all(color[:3] > 0.5):
            text_color = (0.0, 0.0, 0.0)
        else:
            text_color = (1.0, 1.0, 1.0)
        ax.text(x, y, fmt % value, ha="center", va="center", color=text_color, **kw)
def cm2inch(*tupl):
    '''
    Specify figure size in centimeter in matplotlib.

    Converts lengths given in centimeters to inches. Accepts either separate
    numbers, cm2inch(40, 20), or one tuple of numbers, cm2inch((40, 20)),
    and returns a tuple of the converted values. Called with no arguments it
    returns an empty tuple.

    Source: http://stackoverflow.com/a/22787457/395857
    By gns-ank
    '''
    cm_per_inch = 2.54
    if not tupl:
        return ()
    # A single tuple argument is unpacked; otherwise use the arguments as given.
    lengths = tupl[0] if isinstance(tupl[0], tuple) else tupl
    return tuple(length / cm_per_inch for length in lengths)
def heatmap(AUC, title, xlabel, ylabel, xticklabels, yticklabels):
    '''
    Draw `AUC` as an annotated heatmap on a new matplotlib figure.

    AUC         -- 2-D array of values; colors are scaled over [0.0, 1.0].
    title       -- figure title.
    xlabel      -- label of the x axis.
    ylabel      -- label of the y axis.
    xticklabels -- one label per column of AUC.
    yticklabels -- one label per row of AUC.

    The figure is left as the current pyplot figure so the caller can save
    or show it; nothing is returned.

    Inspired by:
    - http://stackoverflow.com/a/16124677/395857
    - http://stackoverflow.com/a/25074150/395857
    '''
    # Plot it out
    fig, ax = plt.subplots()
    c = ax.pcolor(AUC, edgecolors='k', linestyle='dashed', linewidths=0.2,
                  cmap='RdBu', vmin=0.0, vmax=1.0)

    # put the major ticks at the middle of each cell
    ax.set_yticks(np.arange(AUC.shape[0]) + 0.5, minor=False)
    ax.set_xticks(np.arange(AUC.shape[1]) + 0.5, minor=False)

    # set tick labels
    ax.set_xticklabels(xticklabels, minor=False)
    ax.set_yticklabels(yticklabels, minor=False)

    # set title and x/y labels
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Remove last blank column
    ax.set_xlim((0, AUC.shape[1]))

    # Turn off all the tick marks (labels stay). A zero tick length is used
    # instead of toggling per-tick attributes so the marks also stay hidden
    # after tick_top() below reconfigures the x axis.
    ax.tick_params(axis='both', which='major', length=0)

    # Add color bar
    fig.colorbar(c, ax=ax)

    # Add text in each cell
    show_values(c)

    # Proper orientation (origin at the top left instead of bottom left)
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    # resize (figure size is given in centimeters)
    fig.set_size_inches(cm2inch(40, 20))
def main():
    '''
    Build a random binary label matrix, compute its co-occurrence matrix
    (raw counts and row-normalized), print both, and save a heatmap of the
    normalized matrix to 'image_output.png'.
    '''
    # Generate data: 5 labels, 10 examples, binary.
    label_headers = 'Alice Bob Carol Dave Eve'.split(' ')
    label_data = np.random.randint(0, 2, (10, 5))  # binary here but could be any integer.
    print('labels:\n{0}'.format(label_data))

    # Compute cooccurrence matrix
    cooccurrence_matrix = np.dot(label_data.transpose(), label_data)
    print('\ncooccurrence_matrix:\n{0}'.format(cooccurrence_matrix))

    # Compute cooccurrence matrix in percentage: row i divided by entry (i, i)
    # FYI: http://stackoverflow.com/questions/19602187/numpy-divide-each-row-by-a-vector-element
    #      http://stackoverflow.com/questions/26248654/numpy-return-0-with-divide-by-zero/32106804#32106804
    cooccurrence_matrix_diagonal = np.diagonal(cooccurrence_matrix)
    # 0/0 for a zero diagonal entry yields NaN; silence the warning and map NaN to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        cooccurrence_matrix_percentage = np.nan_to_num(
            np.true_divide(cooccurrence_matrix, cooccurrence_matrix_diagonal[:, None]))
    print('\ncooccurrence_matrix_percentage:\n{0}'.format(cooccurrence_matrix_percentage))

    # Add count in labels, e.g. 'Alice (3)'
    label_header_with_count = [
        '{0} ({1})'.format(label_header, label_count)
        for label_header, label_count in zip(label_headers, cooccurrence_matrix_diagonal)
    ]
    print('\nlabel_header_with_count: {0}'.format(label_header_with_count))

    # Plotting
    title = "Co-occurrence matrix\n"
    xlabel = ''  # "Labels"
    ylabel = ''  # "Labels"
    xticklabels = label_header_with_count
    yticklabels = label_header_with_count
    heatmap(cooccurrence_matrix_percentage, title, xlabel, ylabel, xticklabels, yticklabels)
    # use format='svg' or 'pdf' for vectorial pictures
    plt.savefig('image_output.png', dpi=300, format='png', bbox_inches='tight')
    #plt.show()
# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
    #cProfile.run('main()') # if you want to do some profiling
(PD: una visualización ordenada de una matriz de co-ocurrencia en D3.js )
Sé cómo hacer esto en R. Pero, ¿hay alguna función en pandas que transforme un DataFrame en una matriz de co-ocurrencia de n×n que contenga los recuentos de dos aspectos que ocurren simultáneamente?
Por ejemplo, una matriz df:
import pandas as pd

# Example frame: one row per TFD code, one column per aspect; the flags are
# stored as the strings '1' / '0' (convert with astype(int) before counting).
df = pd.DataFrame({'TFD': ['AA', 'SL', 'BB', 'D0', 'Dk', 'FF'],
                   'Snack': ['1', '0', '1', '1', '0', '0'],
                   'Trans': ['1', '1', '1', '0', '0', '1'],
                   'Dop': ['1', '0', '1', '0', '1', '1']}).set_index('TFD')
print(df)
>>>
Dop Snack Trans
TFD
AA 1 1 1
SL 0 0 1
BB 1 1 1
D0 0 1 0
Dk 1 0 0
FF 1 0 1
[6 rows x 3 columns]
produciría:
Dop Snack Trans
Dop 0 2 3
Snack 2 0 2
Trans 3 2 0
Dado que la matriz es simétrica respecto a la diagonal, supongo que habría una forma de optimizar el código.
En caso de que tengas un corpus más grande y una matriz de frecuencia de términos, usar la multiplicación de matrices dispersas podría ser más eficiente. Utilizo el mismo truco de multiplicación de matrices que se menciona en otra respuesta de esta página.
import scipy.sparse as sp

X = sp.csr_matrix(df.astype(int).values)  # convert dataframe to sparse matrix
# Sparse matrix product X^T X; .dot() is used so the operation is a matrix
# product whatever sparse container type is in play, and the result is
# converted to CSR explicitly.
Xc = X.T.dot(X).tocsr()
Xc.setdiag(0)  # reset diagonal
Xc.eliminate_zeros()  # drop the explicit zeros that setdiag(0) leaves stored
print(Xc.todense())  # to print co-occurrence matrix in dense format
`Xc` será aquí la matriz de co-ocurrencia en formato CSR disperso.
Es álgebra lineal simple: multiplicas la transpuesta de la matriz por la propia matriz (tu ejemplo contiene cadenas, no olvides convertirlas a números enteros):
>>> df_asint = df.astype(int)
>>> coocc = df_asint.T.dot(df_asint)
>>> coocc
Dop Snack Trans
Dop 4 2 3
Snack 2 3 2
Trans 3 2 4
Si, como en la respuesta en R, quieres restablecer la diagonal, puedes usar fill_diagonal de NumPy:
>>> import numpy as np
>>> np.fill_diagonal(coocc.values, 0)
>>> coocc
Dop Snack Trans
Dop 0 2 3
Snack 2 0 2
Trans 3 2 0