
TensorFlow Basics 34 - Word Count with Korean Data

코딩탕탕 2022. 12. 13. 11:12

 

Word count with Korean data

# Word count with Korean data

from sklearn.feature_extraction.text import CountVectorizer

text_data = ['나는 배 고프다 아니 배가 고프다.', '오늘 점심 뭐 먹지?', '내일 공부 해야겠다.', '점심 먹고 공부 해야지!']

count_vec = CountVectorizer(analyzer='word', min_df=1)  # analyzer='word' tokenizes by word, 'char' by character
# count_vec = CountVectorizer(analyzer='word', min_df=1, ngram_range=(1, 1))  # unigrams only (the default)
# count_vec = CountVectorizer(analyzer='word', min_df=1, ngram_range=(3, 3))  # three-word sequences as features
# count_vec = CountVectorizer(analyzer='word', min_df=1, max_df=5)  # ignore terms appearing in more than 5 documents
# count_vec = CountVectorizer(stop_words=['나는', '해야지'])  # words listed in stop_words (stopwords) are excluded

count_vec.fit(raw_documents=text_data)
print(count_vec.get_feature_names_out())
print(count_vec.vocabulary_)  # vocabulary: word-to-index mapping
# vectorize with transform()
print([text_data[0]])
sentence = [text_data[0]]
print(count_vec.transform(raw_documents=sentence))  # sparse output: (row, column index)  count
print(count_vec.transform(sentence))                # same call without the keyword argument
print(count_vec.transform(sentence).toarray())      # dense array form


<console>
['고프다' '공부' '나는' '내일' '먹고' '먹지' '배가' '아니' '오늘' '점심' '해야겠다' '해야지']
{'나는': 2, '고프다': 0, '아니': 7, '배가': 6, '오늘': 8, '점심': 9, '먹지': 5, '내일': 3, '공부': 1, '해야겠다': 10, '먹고': 4, '해야지': 11}
['나는 배 고프다 아니 배가 고프다.']
  (0, 0)	2
  (0, 2)	1
  (0, 6)	1
  (0, 7)	1
  (0, 0)	2
  (0, 2)	1
  (0, 6)	1
  (0, 7)	1
[[2 0 1 0 0 0 1 1 0 0 0 0]]
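
In the sparse printout above, each line is a (document index, feature index) pair followed by its count, and the dense row [[2 0 1 0 0 0 1 1 0 0 0 0]] follows the column order of get_feature_names_out(). Below is a minimal sketch, not part of the original code, that maps each nonzero count back to its word, reusing the fitted count_vec and text_data from above:

import numpy as np

vec = count_vec.transform([text_data[0]]).toarray()[0]  # dense counts for the first sentence
features = count_vec.get_feature_names_out()             # feature names in column order
for idx in np.nonzero(vec)[0]:
    print(features[idx], vec[idx])  # prints 고프다 2, 나는 1, 배가 1, 아니 1; the single-character token '배' is dropped by the default token pattern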

 

 

 

Word count after morphological analysis

from konlpy.tag import Okt

okt = Okt()
my_words = []
for i, doc in enumerate(text_data):
    for word in okt.pos(doc, stem=True):  # (token, POS tag) pairs; stem=True normalizes tokens to their stems
        # print(word)
        if word[1] in ['Noun', 'Verb', 'Adjective']:  # keep only nouns, verbs, and adjectives
            my_words.append(word[0])
            
print(my_words)

count_vec = CountVectorizer(analyzer='word', min_df=1, ngram_range=(1, 1))
count_vec.fit(my_words)  # note: each element of my_words is treated as a separate document
print(count_vec.get_feature_names_out())  # single-character tokens are dropped by default
print(count_vec.vocabulary_)
print(count_vec.transform(my_words))
print(count_vec.transform(my_words).toarray())

print('---------------')
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vec = TfidfVectorizer(analyzer='word', min_df=1)
tfidf_vec.fit(my_words)
print(tfidf_vec.get_feature_names_out())  # single-character tokens are dropped by default
print(tfidf_vec.vocabulary_)
print(tfidf_vec.transform(my_words))
print(tfidf_vec.transform(my_words).toarray())



<console>
['고프다' '공부' '내일' '먹다' '아니다' '오늘' '점심' '하다']
{'고프다': 0, '아니다': 4, '오늘': 5, '점심': 6, '먹다': 3, '내일': 2, '공부': 1, '하다': 7}
  (2, 0)	1.0
  (3, 4)	1.0
  (5, 0)	1.0
  (6, 5)	1.0
  (7, 6)	1.0
  (9, 3)	1.0
  (10, 2)	1.0
  (11, 1)	1.0
  (12, 7)	1.0
  (13, 6)	1.0
  (14, 3)	1.0
  (15, 1)	1.0
  (16, 7)	1.0
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]
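
Because fit() and transform() above receive my_words, every single morpheme is treated as its own document, which is why the matrix has one mostly-zero row per token (the all-zero rows correspond to single-character tokens dropped by the default token pattern). Below is a variant sketch, not from the original post, that joins the extracted morphemes back into one string per sentence and vectorizes at the document level; the names docs and tfidf_doc are introduced here only for illustration. It reuses text_data, okt, and TfidfVectorizer from above.

docs = []
for doc in text_data:
    # keep only nouns, verbs, and adjectives, then rebuild one token string per sentence
    tokens = [w for w, tag in okt.pos(doc, stem=True) if tag in ['Noun', 'Verb', 'Adjective']]
    docs.append(' '.join(tokens))

tfidf_doc = TfidfVectorizer(analyzer='word', min_df=1)
print(tfidf_doc.fit_transform(docs).toarray())  # one row per sentence: shape (4, vocabulary size)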