[Python] 파일에서 필요한 데이터를 추출해 파일로 저장하기

2017-03-10 by 편리

아래의 코드는 파이썬을 공부하면서 만들었기 때문에 최적화 등과는 거리가 멀다. 테스트 목적의 코드이기 때문에 정상 작동은 보장할 수 없다. 판매자와 상품코드가 탭으로 분리된 큰 용량의 텍스트 파일에서 각 판매자별로 상품코드를 분리해서 각각 텍스트 파일로 저장한다. 상품코드 중 exclude_item_id.txt 파일에 저장된 코드는 제외한다.

# -*- coding: utf-8 -*-

import datetime
import os
import time
from multiprocessing import Process

# Unique seller ids, in first-seen order; file_read() appends to this list
# in place and also returns it under the 'seller' key.
seller = []
# Item codes to leave out of the output; file_read() appends to this list
# in place and also returns it under the 'exclude' key.
exclude = []
# NOTE(review): never written by the functions in this file -- they work with
# their own local variables -- so this list stays empty.
datas = []

def get_exce_time(start, end):
    """Return the time elapsed between two timestamps as a ``timedelta``.

    Args:
        start: Start timestamp in seconds (e.g. a ``time.time()`` value).
        end: End timestamp in seconds.

    Returns:
        A ``datetime.timedelta`` covering ``end - start`` seconds.
    """
    elapsed_seconds = end - start
    return datetime.timedelta(seconds=elapsed_seconds)

def file_read():
    """Load the seller ids, the excluded item codes and the raw data lines.

    Reads three UTF-8 text files from the current working directory:

    * ``seller_id.txt`` -- tab-separated; the fifth column (index 4) of each
      line is taken as the seller id.
    * ``exclude_item_id.txt`` -- one item code per line.
    * ``AllDataBasedOnDB.dat`` -- kept as raw lines, newlines included.

    New seller ids and excluded codes are appended, in first-seen order and
    without duplicates, to the module-level ``seller`` and ``exclude`` lists.
    The elapsed time is printed before returning.

    Returns:
        dict with the keys ``'seller'`` and ``'exclude'`` (the module-level
        lists themselves) and ``'datas'`` (list of raw data lines).

    Raises:
        OSError: If any of the three files cannot be opened.
        IndexError: If a non-blank line of ``seller_id.txt`` has fewer than
            five tab-separated columns.
    """
    start = time.time()

    results = {}

    # Sets give O(1) duplicate detection; the lists preserve insertion order.
    known_sellers = set(seller)
    with open('seller_id.txt', 'r', encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # A blank line (e.g. a trailing newline) has no seller column.
                continue
            seller_id = line.split("\t")[4].strip()
            if seller_id not in known_sellers:
                known_sellers.add(seller_id)
                seller.append(seller_id)

    results['seller'] = seller

    known_excluded = set(exclude)
    with open('exclude_item_id.txt', 'r', encoding="utf-8") as f:
        for line in f:
            code = line.strip()
            if code not in known_excluded:
                known_excluded.add(code)
                exclude.append(code)

    results['exclude'] = exclude

    with open('AllDataBasedOnDB.dat', 'r', encoding="utf-8") as f:
        data_lines = f.readlines()

    results['datas'] = data_lines

    end = time.time()
    print("File Reading Time : {0}".format(get_exce_time(start, end)))

    return results

def find_item_code(data, process, idx, limit):
    """Write the item codes of this worker's sellers to ``./out/<id>.txt``.

    Sellers are shared out round-robin: this worker handles every seller
    whose position ``k`` in ``data['seller']`` satisfies
    ``k % process == idx``. For each such seller, every data line containing
    the seller id is split on tabs and its first column is taken as the item
    code. Codes listed in ``data['exclude']`` and codes already collected for
    the seller are skipped. The remaining codes are written one per line, in
    first-seen order, as UTF-8. The elapsed time is printed at the end.

    NOTE(review): a seller matches any line that contains its id as a
    substring anywhere in the line -- confirm that ids cannot occur inside
    other ids or inside item codes.

    Args:
        data: dict with the keys ``'seller'``, ``'exclude'`` and ``'datas'``,
            as returned by ``file_read()``.
        process: Total number of workers.
        idx: Zero-based index of this worker.
        limit: Maximum number of codes to collect per seller; a value of 0
            or less means no limit.
    """
    start = time.time()

    sellers = data['seller']
    # Set lookup keeps the per-line exclusion test O(1).
    excluded = set(data['exclude'])
    lines = data['datas']

    # exist_ok=True tolerates several workers creating the directory at once.
    os.makedirs('./out', exist_ok=True)

    for k, s_id in enumerate(sellers):
        if k % process != idx:
            continue

        codes = []      # output order
        seen = set()    # same codes, for O(1) duplicate detection

        for line in lines:
            if limit > 0 and len(codes) == limit:
                break

            if s_id not in line:
                continue

            code = line.split("\t")[0].strip()

            if code in excluded or code in seen:
                continue

            seen.add(code)
            codes.append(code)

        # Create the txt file; written as UTF-8 to match the input files.
        with open('./out/' + s_id + '.txt', 'w', encoding="utf-8") as f:
            f.writelines(code + "\n" for code in codes)

    end = time.time()
    print("Process #{0} Running Time : {1}".format(idx, get_exce_time(start, end)))

if __name__ == '__main__':
    data = file_read()

    # Number of worker processes; sellers are split round-robin among them.
    process = 4

    # Maximum number of item codes extracted per seller (e.g. 100000).
    # 0 extracts every record.
    limit = 0

    procs = [
        Process(target=find_item_code, args=(data, process, i, limit))
        for i in range(process)
    ]
    for p in procs:
        p.start()

    # Wait for every worker explicitly so the script ends only once all
    # output files have been written.
    for p in procs:
        p.join()

multiprocessing 모듈을 사용해서 다수의 프로세스로 처리하도록 했다.

Leave a Reply Cancel reply