반응형
import pandas as pd
import multiprocessing
import numpy as np
from collections import defaultdict
from tqdm import trange
import re
import numpy as np
from collections import defaultdict
# Load the raw tweet export; the file is CP949-encoded and is parsed with
# pandas' Python engine.
data = pd.read_csv('twitter.csv', encoding='CP949', engine='python')
# The author handle is not used by any later step.
data.drop(columns=['username'], inplace=True)
def pre_1(data):
    """Clean the raw tweet table and expand each row by its ``sum`` count.

    Rows whose ``content`` is missing are discarded, every remaining row is
    repeated ``sum`` times in total (a row with ``sum == 3`` appears three
    times), the result is ordered by ``tweet_date`` and the ``sum`` column is
    removed.

    Args:
        data: DataFrame holding at least the columns ``content``, ``sum``
            and ``tweet_date``. It is not modified.

    Returns:
        A new DataFrame without the ``sum`` column, indexed 0..n-1.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # Re-number the rows after dropping so that the repeat below works on a
    # gap-free, unique index.
    cleaned = data.dropna(subset=["content"]).reset_index(drop=True)

    # Missing, non-numeric or < 1 counts keep the row exactly once.
    counts = (
        pd.to_numeric(cleaned["sum"], errors="coerce")
        .fillna(1)
        .clip(lower=1)
        .astype(int)
    )

    # Repeat every row ``sum`` times in one vectorised step.
    expanded = cleaned.loc[cleaned.index.repeat(counts.to_numpy())]

    # mergesort is stable: rows sharing a date keep their relative order.
    expanded = expanded.sort_values(by="tweet_date", kind="mergesort")
    return expanded.drop(columns="sum").reset_index(drop=True)
# Basic text-cleaning function for tweets
def Twitter_use_only(readData, Num=False, Eng=False):
    """Strip Twitter-specific noise from every tweet in ``readData``.

    For each entry of the ``content`` column the following are removed, in
    this order: retweet prefixes (``RT @user: ``), @-mentions, URLs (with
    and without an ``http`` scheme), hashtags, ``&word`` fragments (for
    example the ``&amp`` part of ``&amp;``), and finally every character
    that is not a digit, an ASCII letter or Hangul. Whitespace runs are
    then collapsed to single spaces.

    Args:
        readData: DataFrame with a ``content`` column of strings. The
            column is deleted from the frame once it has been cleaned.
        Num: When truthy, digit runs are removed as well.
        Eng: When truthy, ASCII letters are removed as well.

    Returns:
        A list with one cleaned string per row, in row order.
    """
    # Compiled once per call; applied to every tweet in the order listed.
    substitutions = (
        # Retweet prefix
        (re.compile(r'RT @[\w_]+: '), ''),
        # @-mentions
        (re.compile(r'@[\w_]+'), ''),
        # URLs that start with http/https
        (re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"), ' '),
        # URLs without a scheme
        (re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)"), ' '),
        # Hashtags
        (re.compile('[#]+[0-9a-zA-Z_]+'), ' '),
        # Leftover '&word' fragments
        (re.compile('[&]+[a-z]+'), ' '),
        # Anything that is not a digit, ASCII letter or Hangul; this also
        # turns newlines into spaces.
        (re.compile('[^0-9a-zA-Zㄱ-ㅎ가-힣]'), ' '),
    )
    digit_runs = re.compile(r'\d+')
    ascii_letters = re.compile('[a-zA-Z]')

    list_text = []
    # Iterate over the values so the row labels of the frame do not matter.
    for text in readData['content']:
        for pattern, replacement in substitutions:
            text = pattern.sub(replacement, text)
        if Num:
            text = digit_runs.sub(' ', text)
        if Eng:
            text = ascii_letters.sub(' ', text)
        # Collapse whitespace runs and trim both ends.
        list_text.append(' '.join(text.split()))

    del readData['content']
    return list_text
# Split each sentence into words
def WordList(data):
    """Split every entry of the ``text`` column on whitespace.

    Args:
        data: DataFrame with a ``text`` column. The column is converted to
            ``str`` in place, so non-string values are tokenised from their
            string form.

    Returns:
        A list with one list of tokens per row, in row order.
    """
    data['text'] = data['text'].astype(str)
    # Iterate over the values so the row labels of the frame do not matter.
    return [sentence.split() for sentence in data['text']]
# --- Cleaning pipeline ---
data = pre_1(data)
text = Twitter_use_only(data)

# Pair each tweet date with its cleaned text by position, naming both
# columns explicitly.
data_df = pd.DataFrame({
    'tweet_date': data['tweet_date'].to_numpy(),
    'text': text,
})
data_df_reset = data_df.reset_index(drop=True)
aa = WordList(data_df_reset)
ll = data_df_reset['text']
ll_list = ll.tolist()

# --- Noun extraction with Okt ---
from konlpy.tag import Okt
import sys

okt = Okt()

# Row position at which the corpus is split into the two output files.
SPLIT_INDEX = 277374
list_1 = ll_list[:SPLIT_INDEX]
list_2 = ll_list[SPLIT_INDEX:]


def _nouns_per_sentence(sentences):
    """Return ``okt.nouns(...)`` for each sentence, showing a progress bar."""
    nouns = [okt.nouns(sentences[i]) for i in trange(len(sentences))]
    sys.stdout.flush()
    return nouns


o_ll = _nouns_per_sentence(list_1)
o_ll2 = _nouns_per_sentence(list_2)

# One comma-separated string of nouns per sentence.
ss = [", ".join(nouns) for nouns in o_ll]
sa = [", ".join(nouns) for nouns in o_ll2]

ss_df = pd.DataFrame(ss)
sa_df = pd.DataFrame(sa)
ss_df.to_csv('sample1.csv')
sa_df.to_csv('sample2.csv')
반응형
'-------------코딩------------- > Python 기초 코딩' 카테고리의 다른 글
ValueError: invalid literal for int() with base10: ' ' (0) | 2020.10.19 |
---|---|
네이버 인기검색어 크롤링 (0) | 2020.10.19 |
트위터 크롤링 (0) | 2020.07.28 |
네이버 증권뉴스 크롤링(2) (0) | 2020.07.27 |
네이버 증권뉴스 크롤링(1) (0) | 2020.07.27 |
댓글