python优化代码（文本查找）效率？

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python 2 script (uses the print statement): for every 'gene' row of the
# GFF3 file, scan all rows of 1.txt and print "<1.txt name> <gene attributes>"
# when the two ranges are on the same chromosome and satisfy one of the
# comparisons below.
infile2 = open('genemark.gff3', 'r')
# All stripped lines of 1.txt are held in memory as raw strings; identical
# lines collapse because they are stored in a set.
infile1 = set(line1.strip() for line1 in open('1.txt', 'r'))
for line in infile2:
    line = line.strip().split()
    # Column 2 is the feature type; only 'gene' rows are examined.
    if line[2] == 'gene':
        chr, start, end = line[0], int(line[3]), int(line[4])
        # Full scan of 1.txt for every gene row; each string is split and
        # converted to int again on every pass.
        for line1 in infile1:
            line1 = line1.split()
            chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3])
            if chr1 == chr:
                # The three tests are independent `if`s, so one pair is
                # printed once per test that holds (e.g. twice when both the
                # gene start and the gene end lie strictly inside the range).
                if start1 < start < end1:
                    print line1[0], line[-1]
                if start1 < end < end1:
                    print line1[0], line[-1]
                # The 1.txt range lies strictly inside the gene.
                if start1 > start and end > end1:
                    print line1[0], line[-1]

genemark.gff3 格式类似下边:

chr1D    GeneMark.hmm    gene    2705930    2711118    .    +    .    ID=1903228_g;Name=1903228_g
chr1D    GeneMark.hmm    mRNA    2705930    2711118    .    +    .    ID=1903228_t;Name=1903228_t;Parent=1903228_g

1.txt:

UN011157 chr1D 2705329 2706342 98.4 95.0 972 30 21 0
UN003843 chr1D 2705681 2721144 61.4 97.4 633 12 5 0

附上原始文件的百度云链接，希望感兴趣的参考
点击下载密码 enu8

综合楼下各位朋友的答案，现推荐两种
第一种根据 @ferstar @用筹兮用严的答案，即并行版

#!/usr/bin/env python
# encoding: utf-8

from collections import defaultdict
from multiprocessing import Pool, cpu_count
from functools import partial


def find_sth(f2, f1=None):
    """Append every 1.txt entry overlapping one gene row to ``out.txt``.

    Args:
        f2: A whitespace-split GFF3 row. Index 0 is the chromosome, indexes 3
            and 4 are the start/end coordinates, and the last field is
            written out as the gene label.
        f1: Mapping of chromosome name to an iterable of
            ``(name, start, end)`` tuples with integer coordinates.

    Each hit is written as ``name<TAB>label``. Bounds are compared
    inclusively. ``out.txt`` is opened (in append mode) at most once per
    call, and only when there is at least one hit.
    """
    chrom, label = f2[0], f2[-1]
    start, end = int(f2[3]), int(f2[4])
    # .get() keeps a defaultdict from growing an empty entry for every
    # chromosome that only appears in the GFF3 file.
    hits = [
        uno1 + "\t" + label + "\n"
        for uno1, start1, end1 in f1.get(chrom, ())
        if start1 <= start <= end1
        or start1 <= end <= end1
        or (start1 >= start and end >= end1)
    ]
    if hits:
        # One open/append per gene instead of one per hit.
        with open("out.txt", "a") as fh:
            fh.write("".join(hits))


def main():
    """Index 1.txt by chromosome and map the gene rows over a process pool.

    Rows of 1.txt are stored as ``(name, start, end)`` tuples in a set per
    chromosome. Only GFF3 rows whose third field is ``gene`` are dispatched,
    each as an already-split list of fields.
    """
    by_chrom = defaultdict(set)
    with open('1.txt', 'r') as hits_file:
        for row in hits_file:
            name, chrom, lo, hi, *_rest = row.split()
            by_chrom[chrom].add((name, int(lo), int(hi)))

    gene_rows = []
    with open('genemark.gff3', 'r') as gff:
        for row in gff:
            fields = row.split()
            if fields[2] == 'gene':
                gene_rows.append(fields)

    # The index is bound as the f1 keyword so each task receives it.
    worker = partial(find_sth, f1=by_chrom)
    pool = Pool(cpu_count())
    pool.map(worker, gene_rows)
    pool.close()
    pool.join()


if __name__ == "__main__":
    main()

第二种 @citaret 他的版本（单核版），对单核来说，不逊于上述代码。但是两者结果稍有不同，并行版结果更全（这里少了73条，问题出在判断条件的边界上，由于对 intervaltree 不熟悉，怎么改还不知道），现在边界问题已修改，两种代码结果完全一样，perfect！
如下


from collections import defaultdict
from intervaltree import Interval, IntervalTree

# Group the rows of 1.txt by chromosome (column 1) as (name, start, end).
d1 = defaultdict(list)
with open('1.txt') as hits_file:
    for raw in hits_file:
        cols = raw.strip().split()
        d1[cols[1]].append((cols[0], int(cols[2]), int(cols[3])))

# Swap each per-chromosome list for an interval tree carrying the name as
# the interval's data. Keys are snapshotted so the dict can be updated.
for chrom in list(d1):
    d1[chrom] = IntervalTree(
        Interval(lo, hi, name) for name, lo, hi in d1[chrom]
    )

with open('genemark.gff3') as gff:
    for raw in gff:
        cols = raw.strip().split()
        if cols[2] != 'gene':
            continue
        chrom, start, end = cols[0], int(cols[3]), int(cols[4])
        # The query is widened by one on both sides. A chromosome missing
        # from 1.txt yields the defaultdict's empty list, whose slice is empty.
        for _lo, _hi, name in d1[chrom][start - 1:end + 1]:
            print(name, cols[-1])

python

阅读 7.1k

7 个回答

Fisher_Wong

update 2016.6.5

关于如何使用多进程加速，并且使用 intervaltree 的方法，我试了很多代码，抛弃掉使用进程池按行分发task的方法，原因是处理一行的计算量很小，而频繁给进程函数传参区间树字典的损耗却很大，大家可以试一试处理楼主的4G文件，比单进程要慢多了。

改进的思路是，文件按块读取，这样IO会很快，而进程函数处理的单位以大块计，才有了分摊计算量的意义。我还是用了多进程+Queue，进程数和每次读取文件块的大小也许可以进一步调优。在我的xps13笔记本上，跑完4G文件用的时间是：

 312.09s user 49.00s system 587% cpu 1:01.46 total

以下是代码，希望能抛砖引玉

 #!/usr/bin/env python3

from collections import defaultdict
from intervaltree import Interval, IntervalTree
from multiprocessing import Pool, JoinableQueue, Process


def intervalDict(filename):
    """Build a per-chromosome interval index from a whitespace-split file.

    Every row contributes ``Interval(int(col2), int(col3), col0)`` to the
    tree stored under the key ``col1``.

    Returns:
        A ``defaultdict(list)`` whose populated values are ``IntervalTree``
        objects; looking up an unseen key still produces an empty list.
    """
    grouped = defaultdict(list)
    with open(filename, 'r') as handle:
        for raw in handle:
            cols = raw.strip().split()
            grouped[cols[1]].append((cols[0], int(cols[2]), int(cols[3])))
    # Keys are snapshotted so values can be replaced while looping.
    for key in list(grouped):
        grouped[key] = IntervalTree(
            Interval(lo, hi, name) for name, lo, hi in grouped[key])
    return grouped


def readQ(d, q):
    """Consume text blocks from ``q`` forever and print overlaps of gene rows.

    Args:
        d: Mapping from chromosome name to an object that can be sliced as
            ``obj[start:end]`` and yields ``(begin, end, data)`` triples.
        q: Queue offering ``get(block=True)`` and ``task_done()``; every item
            is a string of newline-separated, whitespace-delimited rows.

    A block is acknowledged with ``task_done()`` only after all of its rows
    have been printed and stdout has been flushed. A caller that waits on
    ``q.join()`` and then stops this process therefore cannot cut a block
    short or discard buffered output. The function itself never returns.
    """
    import sys

    while True:
        b = q.get(block=True)
        try:
            for l in b.split('\n'):
                r = l.strip().split()
                # Blank or short rows have no feature column; skip them.
                if len(r) < 3 or r[2] != "gene":
                    continue
                # Query one position wider than the gene on both sides.
                k, start, end, idname = r[0], int(r[3])-1, int(r[4])+1, r[-1]
                for s, e, u in d[k][start:end]:
                    print(u, idname)
            sys.stdout.flush()
        finally:
            # Acknowledge even on failure so a joiner is never left hanging.
            q.task_done()



def fileBlocks(filename, size):
    """Yield the text of ``filename`` in chunks that end on a line boundary.

    Each chunk is ``size`` characters from ``read`` followed by whatever
    ``readline`` returns next, so no line is split across two chunks.
    Iteration stops when ``read`` returns an empty string.
    """
    with open(filename, 'r') as handle:
        for chunk in iter(lambda: handle.read(size), ''):
            # Read up to the next newline so the chunk has no cut-off line.
            yield chunk + handle.readline()


def writeQ(filename, block_size, q):
    """Put every block of ``filename`` on ``q``, then wait on ``q.join()``.

    Blocks come from ``fileBlocks(filename, block_size)``. The call returns
    once the queue reports that every item has been marked done.
    """
    blocks = fileBlocks(filename, block_size)
    for chunk in blocks:
        q.put(chunk)
    q.join()

if __name__ == '__main__':
    # One producer streams 1 MiB blocks of the GFF3 file into the queue;
    # eight consumers look the gene rows up in the shared index.
    d = intervalDict('1.txt')
    q = JoinableQueue(0)
    producer = Process(target=writeQ, args=('genemark.gff3', 1024*1024, q))
    customers = []
    for _ in range(8):
        customers.append(Process(target=readQ, args=(d, q)))

    producer.start()
    for worker in customers:
        worker.start()

    # The producer returns after its q.join(); the consumers loop forever,
    # so they are terminated explicitly and then reaped.
    producer.join()
    for worker in customers:
        worker.terminate()
    for worker in customers:
        worker.join()

------------- history

intervaltree 学习受教了。
对于多进程版本，我想说，使用的pool.map 的同学，要么是电脑内存够大，要么是没试过楼主提供的数据文件，反正我的电脑是报了这样的错误

  File "d:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 266, in map
    return self._map_async(func, iterable, mapstar, chunksize).get()
  File "d:\ProgramData\Anaconda3\lib\multiprocessing\pool.py", line 376, in _map_async
    iterable = list(iterable)
MemoryError

原因从报错信息就看得很显然，list(文件对象)撑爆了。所以这个用pool.apply() 更好
或者把含 “gene” 的行提前过滤出来，这样pool.map() 时也不会撑爆内存。

prolifes

11.2k51537

发布于
2016-07-29

更新于
2016-07-29

性能应该没有更好的优化余地，不过代码可以稍微调整一下

# For each 'gene' row of 2.txt, print "<1.txt name> <last gene field>" for
# every 1.txt row on the same chromosome that passes the range test.
with open('1.txt', 'r') as f1:
    with open('2.txt', 'r') as f2:
        hit_rows = []
        for raw in f1:
            hit_rows.append(raw.strip().split())

        for raw in f2:
            gene = raw.strip().split()
            if gene[2] == 'gene':
                g_chr, g_start, g_end = gene[0], int(gene[3]), int(gene[4])
                for hit in hit_rows:
                    h_chr, h_start, h_end = hit[1], int(hit[2]), int(hit[3])
                    if h_chr != g_chr:
                        continue
                    # Gene start inside the range, gene end inside the range,
                    # or the range strictly inside the gene.
                    overlaps = (
                        h_start < g_start < h_end
                        or h_start < g_end < h_end
                        or (h_start > g_start and g_end > h_end)
                    )
                    if overlaps:
                        # Same "a b" output as the two-item print statement.
                        print('%s %s' % (hit[0], gene[-1]))

whyreal

1.3k213

发布于
2016-07-29

更新于
2016-07-29

这里给出两个建议：

代码嵌套太深，在函数中可以通过尽早的 return 来减少嵌套层级，同样的在循环中，可以通过使用 continue 来达到减少嵌套层级的目的。
关于性能方面

for line1 in infile1:
    line1 = line1.split()

每次循环都要对 file1 中的行进行 split 操作是非常不明智的

下面是我修改的代码

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Index 1.txt once: chromosome -> list of {"start", "end", "id"} records, so
# the rows are split and converted a single time rather than once per gene.
infile1 = {}
with open('1.txt', 'r') as hits:
    for line1 in hits:
        line1 = line1.strip().split()
        name, chr1, start1, end1 = line1[0], line1[1], int(line1[2]), int(line1[3])
        infile1.setdefault(chr1, []).append({"start": start1, "end": end1, "id": name})

with open('genemark.gff3', 'r') as infile2:
    for line in infile2:
        line = line.strip().split()

        # Rows without a feature column (blank lines, short directives) and
        # non-gene features are skipped.
        if len(line) < 3 or line[2] != 'gene':
            continue

        chrom, start, end = line[0], int(line[3]), int(line[4])

        for i in infile1.get(chrom, ()):
            # One combined test so each pair is reported once. The last
            # clause is "range strictly inside the gene": start1 > start and
            # end > end1 (the end comparison used to be reversed).
            if (i['start'] < start < i['end']
                    or i['start'] < end < i['end']
                    or (i['start'] > start and end > i['end'])):
                # %-formatting prints "id attrs" under Python 2 and 3 alike.
                print('%s %s' % (i['id'], line[-1]))

citaret

8622113

发布于
2016-07-29

更新于
2022-10-07

主循环是求出集合中和一个片段相交的所有片段，我们可以先看看这个集合：

chr1D 7359
chr2D 9219
chr2B 9486
chr2A 8986
chr6B 7178
chr6A 6446
chr6D 6093
chr4A 7543
chr4B 7086
chr4D 6316
...

每个集合的片段数量在6000-10000，遍历的话效率低，因此考虑使用 intervaltree，可以快速得出和一个片段相交的所有片段，具体实现为：

from collections import defaultdict
from intervaltree import Interval, IntervalTree

# Group the rows of 1.txt by chromosome (column 1) as (name, start, end).
d1 = defaultdict(list)
with open('1.txt') as f:
    for raw in f:
        x = raw.strip().split()
        d1[x[1]].append((x[0], int(x[2]), int(x[3])))

# Replace each list with an interval tree carrying the name as the data.
for k in list(d1):
    d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in d1[k])

with open('genemark.gff3') as f:
    for raw in f:
        line = raw.strip().split()
        # Skip blank/short rows as well as non-gene features.
        if len(line) < 3 or line[2] != 'gene':
            continue
        chrom, start, end = line[0], int(line[3]), int(line[4])
        # NOTE(review): intervaltree slices are documented as half-open, so a
        # plain [start:end] query misses ranges that only touch the gene at
        # its first or last position. Widening by one on both sides gives
        # the inclusive overlap test. A chromosome absent from 1.txt yields
        # the defaultdict's empty list, whose slice is empty.
        for start1, end1, un1 in d1[chrom][start - 1:end + 1]:
            # %-formatting prints "name attrs" under Python 2 and 3 alike.
            print('%s %s' % (un1, line[-1]))

时间测试结果为：构建 intervaltree 花时 10秒，但是求交集的过程速度有100倍左右的提升。
intervaltree 参考 https://pypi.python.org/pypi/...

maxtian

发布于
2016-07-30

新手上路，请多包涵

文件打开后不用关闭吗

begin

100916

发布于
2016-07-30

更新于
2016-07-30

from collections import defaultdict

# Index 1.txt by chromosome, then stream the GFF3 file and print
# "<1.txt name> <gene attributes>" for every range that passes the test.
with open('1.txt', 'r') as f1, open('genemark.gff3', 'r') as f2:
    infile1 = defaultdict(set)
    for uno1, chr1, start1, end1, *others in map(str.split, f1):
        infile1[chr1].add((uno1, int(start1), int(end1)))

    for fields in map(str.split, f2):
        # Skip blank/short rows as well as non-gene features.
        if len(fields) < 3 or fields[2] != 'gene':
            continue
        chrom, start, end, info = fields[0], int(fields[3]), int(fields[4]), fields[-1]
        # .get() avoids adding an empty set for chromosomes only seen here.
        for uno1, start1, end1 in infile1.get(chrom, ()):
            # The middle clause must compare against end1; `end < end` is
            # always false and silently dropped "gene end inside the range".
            if start1 < start < end1 or start1 < end < end1 or (start1 > start and end > end1):
                print(uno1, info)

6楼 @ferstar 提出的并行化才是正确的方向，不过代码有点问题……
改了下：

#!/usr/bin/env python
# encoding: utf-8

from collections import defaultdict
from multiprocessing import Pool, cpu_count
from functools import partial


def find_sth(line, f1=None):
    """Print every indexed range that overlaps one raw GFF3 line.

    Args:
        line: One unsplit line of the GFF3 file. Lines with fewer than three
            fields, or whose third field is not ``gene``, are ignored.
        f1: Mapping of chromosome name to an iterable of
            ``(name, start, end)`` tuples with integer coordinates.

    Prints ``name`` and the line's last field for each range where the gene
    start or the gene end lies strictly inside it, or where the range lies
    strictly inside the gene. Always returns ``None``.
    """
    fields = line.split()
    if len(fields) < 3 or fields[2] != 'gene':
        return
    start, end = int(fields[3]), int(fields[4])
    # .get() tolerates chromosomes that are missing from the index.
    for uno1, start1, end1 in f1.get(fields[0], ()):
        # The middle clause compares against end1; `end < end` is always
        # false and dropped every "gene end inside the range" hit.
        if start1 < start < end1 or start1 < end < end1 or (start1 > start and end > end1):
            print(uno1, fields[-1])


def main():
    """Index 1.txt by chromosome and stream the GFF3 lines through a pool.

    The GFF3 file is handed to the pool lazily with ``imap_unordered``
    rather than ``map``, which would first copy every line of the file into
    a list. Workers print their own results, so the values returned through
    the pool are simply drained.
    """
    infile1 = defaultdict(set)
    with open('1.txt', 'r') as f1:
        for uno1, chr1, start1, end1, *others in map(str.split, f1):
            infile1[chr1].add((uno1, int(start1), int(end1)))

    worker = partial(find_sth, f1=infile1)
    pool = Pool(cpu_count())
    try:
        with open('genemark.gff3', 'r') as f2:
            # Large chunks keep the per-task cost of shipping the bound
            # index to the workers small compared with the work per task.
            for _ in pool.imap_unordered(worker, f2, chunksize=100000):
                pass
    finally:
        # close() + join() instead of terminate(): workers exit normally,
        # so output they have buffered is not thrown away.
        pool.close()
        pool.join()


if __name__ == "__main__":
    main()

ferstar

1.2k1111

发布于
2016-07-30

更新于
2016-07-30

发现一个很有意思的事情, 大家回答很积极, 但是实际结果呢, 我刚好无聊小测试了一下, 过程如下:

介于题主提供的示例文本才两行, 所以我把1.txt和genemark.gff3分别加倍到4000行

(qiime) [ngs@cluster ~]$ wc -l 1.txt 
4000 1.txt
(qiime) [ngs@cluster ~]$ wc -l genemark.gff3 
4000 genemark.gff3

按照回复楼层数排序, 如题主的代码是hi.py,然后一楼答主的代码是hi1.py,依次类推

先看题主的

(qiime) [ngs@cluster ~]$ time python hi.py > hi.txt
real    0m0.049s
user    0m0.042s
sys    0m0.007s
(qiime) [ngs@cluster ~]$ wc -l hi.txt 
6000 hi.txt

感觉是有重复

一楼答主的

time python hi1.py > hi1.txt

real    0m21.727s
user    0m21.171s
sys    0m0.547s
(qiime) [ngs@cluster ~]$ wc -l hi1.txt 
8000000 hi1.txt

重复到姥姥家了

二楼答主的

(qiime) [ngs@cluster ~]$ time python hi2.py > hi2.txt

real    0m16.326s
user    0m14.550s
sys    0m1.044s
(qiime) [ngs@cluster ~]$ wc -l hi2.txt 
12000000 hi2.txt

三楼答主的

(qiime) [ngs@cluster ~]$ time python hi3.py > hi3.txt

real    0m27.079s
user    0m26.281s
sys    0m0.786s
(qiime) [ngs@cluster ~]$ wc -l hi3.txt
12000000 hi3.txt

三楼答主的结果跟二楼一样, 但是慢了10秒多

四楼答主的(冤枉四楼同学了,这是py3代码)

(py3) [ngs@cluster ~]$ time python hi4.py > hi4.txt

real    0m0.074s
user    0m0.064s
sys    0m0.010s
(py3) [ngs@cluster ~]$ wc -l hi4.txt 
4000 hi4.txt

果然是有交流才有进步, 目前这个结果才是正确的

总结

实际好像是题主的代码结果会有重复, 四楼答主的结果才是正确的

我的方案--把四楼的代码小改变成并行的

我写的有问题, @用筹兮用严更新了正确的并行代码, 我的代码就不改了, 方便后面看到的同学参考

直接放码(python3)

from collections import defaultdict
import multiprocessing


# Per-process cache of the 1.txt index; filled on the first find_sth() call.
_INTERVALS = None


def _load_intervals(path='1.txt'):
    """Read ``path`` and group its (name, start, end) rows by chromosome."""
    table = defaultdict(set)
    with open(path, 'r') as f1:
        for uno1, chr1, start1, end1, *others in map(str.split, f1):
            table[chr1].add((uno1, int(start1), int(end1)))
    return table


def find_sth(x):
    """Print every 1.txt range that overlaps one split GFF3 gene row.

    Args:
        x: A whitespace-split GFF3 row. Index 0 is the chromosome, indexes 3
            and 4 are the start/end coordinates, and the last field is
            printed next to each matching name.

    ``1.txt`` is read from the current directory on the first call in a
    process and reused afterwards, instead of being re-parsed for every row.
    """
    global _INTERVALS
    if _INTERVALS is None:
        _INTERVALS = _load_intervals()
    chrom, start, end, info = x[0], int(x[3]), int(x[4]), x[-1]
    for uno1, start1, end1 in _INTERVALS.get(chrom, ()):
        # The middle clause compares against end1; `end < end` is always
        # false and dropped every "gene end inside the range" hit.
        if start1 < start < end1 or start1 < end < end1 or (start1 > start and end > end1):
            print(uno1, info)


def main():
    """Collect the split 'gene' rows of the GFF3 file and map them over a pool.

    The pool is sized to ``multiprocessing.cpu_count()`` and each task is
    one call of ``find_sth`` on one row.
    """
    gene_rows = []
    with open('genemark.gff3', 'r') as fh:
        for raw in fh:
            fields = raw.split()
            if fields[2] == 'gene':
                gene_rows.append(fields)

    workers = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(workers)
    pool.map(find_sth, gene_rows)
    pool.close()
    pool.join()

if __name__ == "__main__":
    main()

然后看看运行效率

(py3) [ngs@cluster ~]$ time python hi_new.py > hi_new.txt
real    0m3.033s
user    0m31.952s
sys    0m0.219s

(py3) [ngs@cluster ~]$ wc -l hi_new.txt 
4000 hi_new.txt

时间上貌似慢了很多(4000行数据才几百KB), 题主可以试着用你的真实数据测试下, 处理数据越大, 并行处理的效率优势越明显

PS: 我估计题主实际处理的数据大小得有MB甚至是GB级别, 这种级别并行处理才是王道

源数据及结果度娘地址链接：http://pan.baidu.com/s/1hrSZQuS 密码：u93n

撰写回答

你尚未登录，登录后可以

和开发者交流问题的细节
关注并接收问题和回答的更新提醒
参与内容的编辑和改进，让解决方法与时俱进

被 1 篇内容引用

按指定顺序在在文件中提取并输出
1

推荐问题

python优化代码（文本查找）效率？

总结

我的方案--把四楼的代码小改变成并行的

你尚未登录，登录后可以

字节的 trae AI IDE 不支持类似 vscode 的 ssh remote 远程开发怎么办？

DataCap 中验证码无法显示，后台出现 NullPointerException 错误?

发现深拷贝和浅拷贝效果一致：请问一下有什么区别呢？

如何实现一个深拷贝函数？

Python 成员变量在多个子类实例间共享，如何避免？

为什么 Qwen2.5-Omni-7B 官方教程都报错 Cannot import available module of Qwen2_5OmniModel in modelscope ？

Spark-TTS-0.5B 的 requirements.txt 在哪里？