In [1]:

# Widen the notebook's `.container` element to 90% of the available width.
# `display` and `HTML` come from the public `IPython.display` module: importing
# `display` from `IPython.core.display` is deprecated and emits a warning.
from IPython.display import display, HTML
display(HTML("<style>.container {width:90% !important;}</style>"))

In [1]:

# PowerXlwings 설명
# setpd : dataframe 셋업

import PowerXlwings as px # xlwings 활용
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

In [10]:

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:

# Run each raw sentence through px.tk_to_str, in the same order as before.
# NOTE(review): tk_to_str lives in PowerXlwings and is not shown here; the
# arr1 output further down suggests it returns the tokens padded with spaces.
x, y, z, w = (
    px.tk_to_str(sentence)
    for sentence in (
        "나는 김치 찌개",
        "나는 크림 파스타",
        "나는 된장 찌개",
        "점심 안먹어",
    )
)

In [5]:

# Candidate sentences; x is kept separate and passed alongside this list
# to the px helpers below.
ts=[y,z,w]

In [6]:

# Combine x with the candidates via px.tkappend.
# NOTE(review): helper is not shown; the displayed result is a 1-D string
# array with x first, followed by the entries of ts.
arr1=px.tkappend(x,ts)

In [34]:

# Inspect the combined array that is fed to the TF-IDF vectorizer.
arr1

Out[34]:

array(['  나는    김치    찌개  ', '  나는    크림    파스타  ', '  나는    된장    찌개  ',
       '  점심    안먹어  '], dtype='<U19')

In [11]:

# Fit TF-IDF on the combined sentences and compute every pairwise similarity.
tfidf_vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = tfidf_vectorizer.fit_transform(arr1)

# TfidfVectorizer L2-normalises each row by default, so the row-by-row dot
# products are cosine similarities (the diagonal of the output is 1.0).
# `@` is used instead of `*` so the operation is always a matrix product,
# regardless of which sparse container type is returned.
result = tfidf_matrix @ tfidf_matrix.T

# Dense copy of the (n_sentences x n_sentences) similarity matrix for display.
results = result.toarray()

In [12]:

# Dense pairwise similarity matrix; row/column order follows arr1.
results

Out[12]:

array([[1.        , 0.18433833, 0.50714711, 0.        ],
       [0.18433833, 1.        , 0.18433833, 0.        ],
       [0.50714711, 0.18433833, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

In [47]:

# Ask the px helper for the similarity of each candidate in ts against x.
# NOTE(review): implementation is not shown; the displayed table has `text`
# and `similarity` columns whose values match row 0 of `results`.
px.tkrank_by_TfidfVectorizer(x,ts)

Out[47]:

	text	similarity
1	나는 크림 파스타	0.184338
2	나는 된장 찌개	0.507147
3	점심 안먹어	0.000000

In [13]:

# Show the repr of the sparse similarity product (shape, dtype, stored count).
# Note: the name `result` is rebound to plain Python lists in later cells.
result

Out[13]:

<4x4 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [28]:

from numpy import dot
from numpy.linalg import norm
def cos_sim(A, B):
    """Return the cosine similarity between vectors ``A`` and ``B``.

    Computed as ``dot(A, B) / (norm(A) * norm(B))``.

    Parameters
    ----------
    A, B : array-like
        Numeric vectors of the same length.

    Returns
    -------
    float
        The cosine of the angle between ``A`` and ``B``. If either vector has
        zero norm the similarity is undefined, and ``0.0`` is returned
        instead of the NaN (plus RuntimeWarning) that ``0 / 0`` would produce.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector has no direction; treat it as "nothing in common".
        return 0.0
    return dot(A, B) / denominator

In [35]:

# Four hand-written 0/1 vectors of length 8 used to try out cos_sim.
# NOTE(review): presumably bag-of-words encodings of the four example
# sentences; the vocabulary order they assume is not stated in this cell.
doc1, doc2, doc3, doc4 = (
    np.array(row)
    for row in (
        [0, 0, 1, 0, 0, 0, 1, 1],
        [1, 1, 0, 0, 0, 0, 1, 0],
        [0, 0, 1, 0, 0, 1, 1, 0],
        [0, 0, 0, 1, 1, 0, 0, 0],
    )
)

In [36]:

# doc1 and doc2 share one non-zero position out of three each -> 1/3.
print(cos_sim(doc1, doc2))

0.33333333333333337

In [37]:

# doc1 and doc3 share two non-zero positions out of three each -> 2/3.
print(cos_sim(doc1, doc3))

0.6666666666666667

In [38]:

import pandas as pd # 데이터프레임 사용을 위해
from math import log # IDF 계산을 위해

In [39]:

# Example corpus: one whitespace-separated sentence per document.
docs = [
    '나는 김치 찌개',
    '나는 크림 파스타',
    '나는 된장 찌개',
    '점심 안먹어',
]

# Every distinct whitespace-delimited token in the corpus, sorted.
vocab = sorted({token for sentence in docs for token in sentence.split()})

In [40]:

N = len(docs) # total number of documents

def tf(t, d):
    """Return the term frequency of ``t`` in document ``d``.

    Parameters
    ----------
    t : str
        The term to count.
    d : str or sequence of str
        The document, either as a whitespace-separated string or as an
        already-tokenised sequence.

    Returns
    -------
    int
        Number of tokens in ``d`` that are exactly equal to ``t``.
    """
    # Count whole tokens rather than substrings: ``str.count`` would also
    # count "찌개" inside "김치찌개".
    tokens = d.split() if isinstance(d, str) else list(d)
    return tokens.count(t)

def idf(t, documents=None):
    """Return the inverse document frequency of term ``t``.

    Computed as ``log(n / (df + 1))``, where ``n`` is the number of documents
    and ``df`` is the number of documents containing ``t`` as a whole token.
    Because of the ``+ 1`` in the denominator, the value is ``0`` when
    ``df == n - 1`` and negative when the term occurs in every document.

    Parameters
    ----------
    t : str
        The term to score.
    documents : sequence of (str or sequence of str), optional
        Corpus to score against. Defaults to the notebook-level ``docs``
        list, so existing ``idf(t)`` calls keep working.

    Returns
    -------
    float
        The IDF value (natural logarithm).

    Raises
    ------
    ValueError
        If the corpus is empty (``log(0)`` is undefined).
    """
    if documents is None:
        documents = docs

    df = 0
    for doc in documents:
        # Match whole tokens; a plain ``t in doc`` on a string is a substring
        # test and would find "찌개" inside "김치찌개".
        tokens = doc.split() if isinstance(doc, str) else doc
        df += t in tokens
    return log(len(documents) / (df + 1))

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    The weight is the product of ``tf(t, d)`` and ``idf(t)``.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [41]:

# Term-frequency table: one row per document (in `docs` order), one column
# per vocabulary term (in `vocab` order).
result = [
    [tf(term, docs[row]) for term in vocab]
    for row in range(N)
]

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Out[41]:

	김치	나는	된장	안먹어	점심	찌개	크림	파스타
0	1	1	0	0	0	1	0	0
1	0	1	0	0	0	0	1	1
2	0	1	1	0	0	1	0	0
3	0	0	0	1	1	0	0	0

In [42]:

# One IDF value per vocabulary term, in `vocab` order.
result = [idf(term) for term in vocab]

idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Out[42]:

	IDF
김치	0.693147
나는	0.000000
된장	0.693147
안먹어	0.693147
점심	0.693147
찌개	0.287682
크림	0.693147
파스타	0.693147

In [43]:

# TF-IDF table: one row per document (in `docs` order), one column per
# vocabulary term (in `vocab` order).
result = [
    [tfidf(term, docs[row]) for term in vocab]
    for row in range(N)
]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Out[43]:

	김치	된장	안먹어	점심	찌개	크림	파스타
0	0.693147	0.000000	0.000000	0.000000	0.287682	0.000000	0.000000
1	0.000000	0.000000	0.000000	0.000000	0.000000	0.693147	0.693147
2	0.000000	0.693147	0.000000	0.000000	0.287682	0.000000	0.000000
3	0.000000	0.000000	0.693147	0.693147	0.000000	0.000000	0.000000

In [44]:

from sklearn.feature_extraction.text import CountVectorizer
# Four example sentences, counted with scikit-learn's CountVectorizer.
corpus = [
    '나는 김치 찌개',
    '나는 크림 파스타',
    '나는 된장 찌개',
    '점심 안먹어',
]

# Learn the vocabulary first, then encode the same corpus with it.
vector = CountVectorizer().fit(corpus)

# Per-document frequency of every vocabulary word.
print(vector.transform(corpus).toarray())
# Column index assigned to each word.
print(vector.vocabulary_)

[[1 1 0 0 0 1 0 0]
 [0 1 0 0 0 0 1 1]
 [0 1 1 0 0 1 0 0]
 [0 0 0 1 1 0 0 0]]
{'나는': 1, '김치': 0, '찌개': 5, '크림': 6, '파스타': 7, '된장': 2, '점심': 4, '안먹어': 3}

In [46]:

from sklearn.feature_extraction.text import TfidfVectorizer
# Four example sentences, weighted with scikit-learn's TfidfVectorizer.
corpus = [
    '나는 김치 찌개',
    '나는 크림 파스타',
    '나는 된장 찌개',
    '점심 안먹어',
]

tfidfv = TfidfVectorizer()

# Fit on the corpus and print its dense TF-IDF matrix in one step.
print(tfidfv.fit_transform(corpus).toarray())
# Column index assigned to each word.
print(tfidfv.vocabulary_)

[[0.70203482 0.44809973 0.         0.         0.         0.55349232
  0.         0.        ]
 [0.         0.41137791 0.         0.         0.         0.
  0.64450299 0.64450299]
 [0.         0.44809973 0.70203482 0.         0.         0.55349232
  0.         0.        ]
 [0.         0.         0.         0.70710678 0.70710678 0.
  0.         0.        ]]
{'나는': 1, '김치': 0, '찌개': 5, '크림': 6, '파스타': 7, '된장': 2, '점심': 4, '안먹어': 3}

In [48]:

from sklearn.feature_extraction.text import TfidfVectorizer
# English example corpus; scikit-learn's built-in English stop words are
# dropped before weighting.
X = [
    'Tom plays soccer',
    'Tom loves soccer and baseball',
    'baseball is his hobby and his job',
]

# Note: these two names are also assigned in the earlier similarity cell and
# are rebound here.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(X)

In [51]:

# Printing the sparse matrix lists each stored entry as (row, column) value.
print(tfidf_matrix)

  (0, 5)	0.5178561161676974
  (0, 4)	0.680918560398684
  (0, 6)	0.5178561161676974
  (1, 0)	0.4598535287588349
  (1, 3)	0.6046521283053111
  (1, 5)	0.4598535287588349
  (1, 6)	0.4598535287588349
  (2, 2)	0.6227660078332259
  (2, 1)	0.6227660078332259
  (2, 0)	0.4736296010332684

In [ ]:

[답내만]답답해서 내가 만든 IT 자료

[python] 문장간 유사도 측정 (by 코사인 유사도)

문장 간 유사도 측정 (코사인 유사도 활용하기)

'슬기로운 건설 사무 자동화 > 2) Data 분석' 카테고리의 다른 글

댓글

티스토리툴바

[python] 문장간 유사도 측정 (by 코사인 유사도)

문장 간 유사도 측정 (코사인 유사도 활용하기)

'슬기로운 건설 사무 자동화 > 2) Data 분석' 카테고리의 다른 글

관련글

댓글

티스토리툴바