# Twitter preprocessing (트위터 전처리)
# From a "Python 기초 코딩" blog post by 탶선, 2020-08-04.
# (Blog navigation chrome converted to this comment header so the file parses.)
import pandas as pd
import multiprocessing
import numpy as np
from collections import defaultdict
from tqdm import trange
import re
import numpy as np
from collections import defaultdict


# Load the scraped tweet dump (CP949-encoded) and drop the username column,
# which the preprocessing below never uses.
data = pd.read_csv('twitter.csv', engine='python', encoding='CP949')
data = data.drop(columns=['username'])

def pre_1(data):
    """Clean the raw tweet frame and expand each row by its 'sum' count.

    Drops rows with a missing 'content', repeats every row 'sum' times
    (rows with sum <= 1, or a missing sum, are kept exactly once), sorts
    by 'tweet_date', and removes the now-redundant 'sum' column.

    Returns a new DataFrame with a fresh 0..n-1 index.
    """
    # Keep only rows that have tweet text. The original also called
    # data.dropna(axis=0) and discarded the result (a no-op).
    # reset_index so positional and label-based access agree below.
    data = data.dropna(subset=["content"], axis=0).reset_index(drop=True)

    # Repeat each row 'sum' times. The original appended sum-1 copies one
    # at a time with DataFrame.append, which is O(n^2) and was removed in
    # pandas 2.0, and its positional loop raised KeyError once dropna left
    # gaps in the index. fillna(1)/clip(lower=1) keeps rows whose sum is
    # missing or <= 1 exactly once, matching the original branch.
    repeats = data['sum'].fillna(1).clip(lower=1).astype(int)
    data = data.loc[data.index.repeat(repeats)].reset_index(drop=True)

    # sort_values returns a new frame; the original discarded the result,
    # so the intended sort silently never happened.
    data = data.sort_values(by='tweet_date').reset_index(drop=True)

    del data['sum']

    return data
# Basic Cleaning Text Function
# Basic Cleaning Text Function
def Twitter_use_only(readData, Num=False, Eng=False):
    """Clean raw tweet text for analysis.

    Strips retweet markers, @mentions, URLs, hashtags, HTML-entity-like
    junk and every character other than digits, Latin letters and Korean,
    then collapses whitespace. Optionally also removes digits (Num=True)
    or Latin letters (Eng=True).

    Side effect: drops the 'content' column from readData.
    Returns the list of cleaned strings, one per row.
    """
    # Compile each pattern once instead of re-matching raw strings per tweet.
    re_retweet = re.compile(r'RT @[\w_]+: ')
    re_mention = re.compile(r'@[\w_]+')
    # URLs that start with http(s)
    re_url_http = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
    # Bare-domain URLs without a scheme
    re_url_bare = re.compile(r"[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{2,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)")
    re_hashtag = re.compile(r'[#]+[0-9a-zA-Z_]+')
    re_entity = re.compile(r'[&]+[a-z]+')
    # Everything except digits, Latin letters and Korean
    re_special = re.compile(r'[^0-9a-zA-Zㄱ-ㅎ가-힣]')
    re_digits = re.compile(r'\d+')
    re_latin = re.compile(r'[a-zA-Z]')

    list_text = []

    # Iterate the column's values directly: the original indexed the
    # GLOBAL 'data' by position, which both ignored the readData
    # parameter and raised KeyError once dropna left gaps in the index.
    for pre_text in readData['content']:
        text = re_retweet.sub('', pre_text)   # remove retweet marker
        text = re_mention.sub('', text)       # remove @mentions
        text = re_url_http.sub(' ', text)     # remove http(s) URLs
        text = re_url_bare.sub(' ', text)     # remove bare-domain URLs
        text = re_hashtag.sub(' ', text)      # remove hashtags
        text = re_entity.sub(' ', text)       # remove entity-like junk (&amp...)
        text = re_special.sub(' ', text)      # keep digits/Latin/Korean only
        text = text.replace('\n', ' ')        # newlines to spaces

        if Num is True:
            text = re_digits.sub(' ', text)   # optionally drop digits

        if Eng is True:
            text = re_latin.sub(' ', text)    # optionally drop Latin letters

        # Collapse runs of whitespace into single spaces.
        text = ' '.join(text.split())
        list_text.append(text)

    del readData['content']

    return list_text
# 문장 단어별 split
# Split each sentence into words
def WordList(data):
    """Split every row of data['text'] into whitespace-separated tokens.

    Casts the column to str in place (side effect kept from the original)
    so non-string values (e.g. NaN) split cleanly, then returns one token
    list per row.
    """
    data['text'] = data['text'].astype(str)

    # Iterate values directly: the original indexed by position through
    # the label-based [] accessor, which fails on a non-default index,
    # and built the result with append in a loop.
    return [text.split() for text in data['text']]
# --- Run the preprocessing pipeline ---
data = pre_1(data)
text = Twitter_use_only(data)

# Align on a fresh 0..n-1 index before concatenating column-wise:
# pd.concat(axis=1) aligns on the index, and a gapped index left over
# from dropna would scatter NaN rows through the result.
text_df = pd.DataFrame(text)
data_df = pd.concat([data.reset_index(drop=True), text_df], axis=1)
data_df.columns = ['tweet_date', 'text']

data_df_reset = data_df.reset_index(drop=True)

aa = WordList(data_df_reset)

# Plain Python list of cleaned tweet strings for the tokenizer below.
ll = data_df_reset['text']
ll_list = ll.tolist()

# --- Extract nouns from every cleaned tweet with the Okt tagger ---

from konlpy.tag import Okt

okt = Okt()

# The corpus is processed in two halves (the split point is the original
# hard-coded row count) so each half is tagged and saved separately.
SPLIT_AT = 277374
list_1 = ll_list[:SPLIT_AT]
list_2 = ll_list[SPLIT_AT:]


def _nouns_joined(sentences):
    """Return the ', '-joined Okt nouns of each sentence, one string per row."""
    # trange keeps the original progress bar; the per-row
    # sys.stdout.flush() calls were dropped (tqdm writes to stderr).
    return [", ".join(okt.nouns(sentences[i])) for i in trange(len(sentences))]


# The original duplicated this loop verbatim for each half; factored into
# the single helper above.
ss = _nouns_joined(list_1)
sa = _nouns_joined(list_2)

ss_df = pd.DataFrame(ss)
sa_df = pd.DataFrame(sa)

ss_df.to_csv('sample1.csv')
sa_df.to_csv('sample2.csv')
# (End of post — blog footer "반응형" / "댓글" converted to a comment.)