python判断并合并CSV文件

有如下两个文件
one.csv

列0,  列1,   列2,                 列3
1,    a,     ww.ok;ww.ant;,      anything
2,    e,     ww.kdi;,            ihy
3,    se,    ww.sdd,             sld
4,    sd,    ww.akd,             sdjfa

two.csv

列0,  列1,    列2,                 列3
1,    sd,     ww.ok;,             1245
2,    2e3,    ww.kdi;,            432
3,    de,     ww.sdd;,            232

目标是先判断one.csv列2是否包含two.csv文件列2,如果包含,则将two.csv列3的数据追加到one.csv对应行的末尾,最终结果效果如下:

result.csv

列0,  列1,   列2,                列3,        列4
1,    a,     ww.ok;ww.ant;,     anything    1245
2,    e,     ww.kdi;,           ihy         432
3,    se,    ww.sdd,            sld         232
4,    sd,    ww.akd,            sdjfa

自己写了几行代码,一直卡在追加这一块,因为实际要处理的数据量大,不像给出的样本这样,以one.csv为基础遍历,则比对的数据不全,以two.csv为基础遍历,则写文件不知道如何进行。求前辈们指点。

import csv
import datetime


start = datetime.datetime.now()

with open('D:\one.csv') as one:
    ic_rd = csv.reader(one)
    next(ic_rd)
    for i in ic_rd:
        with open('D:\two.csv') as ga:
            ga_rd = csv.reader(ga)
            next(ga_rd)
            for g in ga_rd:
                if g[2] in i[2]:
                    i.append(g[2])                 
                    break
        with open('D:\\result.csv','a+') as rs:
            writer = csv.writer(rs)
            writer.writerow(i)

end = datetime.datetime.now()
print (end-start)
阅读 3.6k
2 个回答
# coding: utf-8

import pandas as pd

lst1 = [
    [1, 'a', 'ww.ok;ww.ant', 'anything'],
    [2, 'e', 'ww.kdi', 'ihy'],
    [3, 'se', 'ww.sdd', 'sld'],
    [4, 'sd', 'ww.akd', 'sdjfa']
]

lst2 = [
    [1, 'sd', 'ww.ant', 1245],
    [2,'2e3', 'ww.kdi', 432],
    [3,'de', 'ww.sdd', 232]
]

df1 = pd.DataFrame(lst1, columns=['A1', 'B1', 'C1', 'D1'])
df2 = pd.DataFrame(lst2, columns=['A2', 'B2', 'C2', 'D2'])

#列拆成多行
df11 = df1.join(df1['C1'].str.split(';', expand=True).stack().reset_index(level=1, drop=True).rename('E'))
df22 = df2.join(df2['C2'].str.split(';', expand=True).stack().reset_index(level=1, drop=True).rename('E'))

#左连接, 并只取需要的字段
df3 = pd.merge(df11, df22, how='left', on='E').loc[:, ['A1', 'B1', 'C1', 'D1', 'D2']]

#去重
print df3.groupby(['A1', 'B1', 'C1', 'D1'], as_index=False).max()

我新手上路,用正则表达式去根据你给出的例子的数据去匹配的,我是根据two给出的值去匹配one,所以前提是two.csv文件不能太大,我刚刚测试了一千万条(大概500M左右,每一百条有一条匹配)的数据用了4分钟,效率不是很高,程序是默认编码,如果要转码效率可能会再差点,数据真是超大的话我觉得还是用c语言去写工作效率会高点

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import os
#用来匹配two.csv列2和列3的值
reg_list2 = re.compile(r'.+?,.+?,\s*(.+?)\s*[;]*\s*,\s*(\d+)\s*')
#用来匹配one.csv列2的值
reg_name = re.compile(r'.+?,.+?,\s*(.+?)\s*[;]*,')
#匹配头
reg_title = re.compile(r'\s*\S+.+')
def main(argv):
    try:
        fptwo = open("two.csv", "r")
        fpone = open("one.csv", "r")
        fpnew = open("result.csv", "a")
    except UnicodeDecodeError:
        exit(1)
    text = fptwo.read()
    fptwo.close()
    
    data_two = [(re.escape(x),n) for x,n in reg_list2.findall(text)]
    otime = datetime.datetime.now()
    title = None
    new = "列4\n" #新列名字
    tell = fpone.tell()
    nu = 0    
    
    while True:
        line = fpone.readline()
        mate = reg_name.match(line)

        if not title:
            mate = reg_title.search(line)
            if mate:
                title = mate.group(0)
                title = (title + ",").ljust(len(title)+15) + new
                fpnew.write(title)
                continue
        
        if mate:
            name = mate.group(1)
            for item in data_two:
                if re.search(item[0],name):
                    line = line[:-1]
                    line = (line + ",").ljust(len(title))  + item[1] + "\n"
                    print ("Pos L:", nu, "name: ", name)
        
        fpnew.write(line)
        nu += 1
        if tell == fpone.tell():
            break
        else:
            tell = fpone.tell()
    fpone.close()
    fpnew.close()
    return 0

if __name__ == '__main__':
    import sys
    sys.exit(main(sys.argv))
撰写回答
你尚未登录,登录后可以
  • 和开发者交流问题的细节
  • 关注并接收问题和回答的更新提醒
  • 参与内容的编辑和改进,让解决方法与时俱进
推荐问题