#!/usr/bin/env python3
# Module metadata. `time` looks like the creation date of the script
# (YYYY_MM_DD) -- NOTE(review): confirm; nothing in this file reads either value.
author = "stephen"
time = '2018_09_03'
import os
import shutil
import subprocess
import tempfile

import PyPDF2
from PyPDF2 import utils
class remove_water_mark(object):
    """Batch-strip a known watermark form XObject from the PDFs in a directory.

    For every regular file in ``pdf_path`` the file name is cleaned up, the
    PDF is opened with PyPDF2 and then one of three things happens:

    * the file cannot be parsed -> it is moved to ``del_path``;
    * the second page carries no watermark -> the file is copied unchanged to
      ``remove_path`` and the original is moved to ``trash_pdf_path``;
    * otherwise -> a cleaned copy is written to ``remove_path`` and the
      original is moved to ``trash_pdf_path``.
    """

    # Substring looked for in a page's /Resources to decide whether the page
    # carries the watermark.
    WATERMARK_ID = 'FormXob.86cdf15f1994e2f2b7032e461afd4234'

    # File-name clean-up table: spaces and colons become dashes, curly quotes
    # and parentheses are dropped.
    _NAME_TABLE = str.maketrans({
        ' ': '-',
        ':': '-',
        '“': None,
        '”': None,
        '(': None,
        ')': None,
    })

    def __init__(self,
                 pdf_path='/home/shenjianlin/fake_pdf/',
                 remove_path='/home/shenjianlin/fake_remove_path/',
                 trash_pdf_path='/home/shenjianlin/rubbish_pdf/',
                 del_path='/home/shenjianlin/pdf_file/del_file/'):
        """Store the working directories.

        Args:
            pdf_path: directory scanned for input PDFs.
            remove_path: directory receiving the processed PDFs.
            trash_pdf_path: directory receiving originals after processing.
            del_path: directory receiving files PyPDF2 could not open.
        """
        self.pdf_path = pdf_path
        self.remove_path = remove_path
        self.trash_pdf_path = trash_pdf_path
        self.del_path = del_path

    def get_pdf(self):
        """Return the names of all entries in ``pdf_path``."""
        return os.listdir(self.pdf_path)

    def read_content(self):
        """Process every file found in ``pdf_path``.

        Directories and entries whose name contains ``.py`` are skipped
        untouched (they are no longer renamed to ``*.pdf`` first).
        """
        for entry in self.get_pdf():
            source = os.path.join(self.pdf_path, entry)
            if not os.path.isfile(source) or '.py' in entry:
                print()
                continue
            self._process_file(self._normalise_name(entry))

    def _normalise_name(self, entry):
        """Rename *entry* to its cleaned-up ``.pdf`` name and return the new path."""
        clean = entry.translate(self._NAME_TABLE)
        if '.pdf' not in clean:
            clean += '.pdf'
        source = os.path.join(self.pdf_path, entry)
        target = os.path.join(self.pdf_path, clean)
        if target != source:
            os.rename(source, target)
        return target

    @staticmethod
    def _open_reader(pdf_file):
        """Open *pdf_file* and return ``(stream, reader)``; close the stream on failure."""
        stream = open(pdf_file, 'rb')
        try:
            return stream, PyPDF2.PdfFileReader(stream)
        except Exception:
            stream.close()
            raise

    @staticmethod
    def _qpdf_decrypt(pdf_file):
        """Decrypt *pdf_file* in place with the external ``qpdf`` tool (best effort)."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_copy = os.path.join(tmp_dir, 'temp.pdf')
            shutil.copy(pdf_file, tmp_copy)
            try:
                # Argument list instead of a shell string: file names are
                # never interpreted by a shell.
                subprocess.call(
                    ['qpdf', '--password=', '--decrypt', tmp_copy, pdf_file])
            except OSError as e:
                # qpdf missing or not executable; keep going like the
                # original os.system() call did.
                print(e)

    def _unlock(self, stream, pdf, pdf_file):
        """Decrypt an encrypted PDF with the empty password.

        Falls back to ``qpdf`` when PyPDF2 raises, in which case the file is
        re-opened. Returns the ``(stream, reader)`` pair to keep using.
        """
        try:
            pdf.decrypt('')
        except Exception:
            stream.close()
            self._qpdf_decrypt(pdf_file)
            print('File Decrypted (qpdf)')
            return self._open_reader(pdf_file)
        print('File decrypted pdf')
        return stream, pdf

    @staticmethod
    def _drop_watermark_form(page, index):
        """Remove the last ``/FormXob*`` entry from the page's /XObject dict."""
        if not page.get('/Resources'):
            return
        resources = page['/Resources']
        if not resources.get('/XObject'):
            return
        xobject = resources['/XObject']
        form = None
        for item in xobject:
            if item.startswith('/FormXob'):
                form = item
        if form:
            print('remove water mark in page: {}'.format(index))
            xobject.pop(form)

    def _strip_watermarks(self, pdf, base_name):
        """Build the cleaned output for one PDF.

        Returns a fresh ``PdfFileWriter`` holding every page, or ``None`` when
        the page at index 1 shows no watermark and the file should simply be
        passed through unchanged.
        """
        # One writer per input file. Sharing a single writer across files
        # made pages of earlier PDFs show up in later outputs.
        writer = PyPDF2.PdfFileWriter()
        for index in range(pdf.getNumPages()):
            page = pdf.getPage(index)
            marked = self.WATERMARK_ID in str(page.get('/Resources'))
            # NOTE(review): index 1 is the *second* page; confirm whether the
            # first page (index 0) was meant to drive this decision.
            if index == 1 and not marked:
                print('没有水印水处理,直接移动文件夹')
                print('移动的文件夹为%s' % base_name)
                return None
            # Only the first three pages are inspected for the watermark.
            if index < 3 and marked:
                print('有水印需要去除水印')
                self._drop_watermark_form(page, index)
            writer.addPage(page)
        return writer

    def _process_file(self, pdf_file):
        """Open, clean and file away a single PDF located at *pdf_file*."""
        base_name = os.path.basename(pdf_file)
        print('读取的文件为%s' % pdf_file)
        try:
            stream, pdf = self._open_reader(pdf_file)
        except Exception as e:
            print(e)
            print('文件有问题正在移动')
            shutil.move(pdf_file, os.path.join(self.del_path, base_name))
            return

        try:
            if pdf.isEncrypted:
                stream, pdf = self._unlock(stream, pdf, pdf_file)
            writer = self._strip_watermarks(pdf, base_name)
            if writer is not None:
                # The source stream stays open until the output is written,
                # since the pages added to the writer come from that reader.
                target = os.path.join(self.remove_path, base_name)
                with open(target, 'wb') as outfile:
                    try:
                        writer.write(outfile)
                    except Exception as e:
                        print(e)
        finally:
            stream.close()

        if writer is None:
            # No watermark: pass the file through unchanged.
            shutil.copy(pdf_file, self.remove_path)
            shutil.move(pdf_file, self.trash_pdf_path)
            return

        print('watermark is get over')
        print('此时的文件夹为%s' % pdf_file)
        shutil.move(pdf_file, os.path.join(self.trash_pdf_path, base_name))
        print('已经移动文件夹')
        print('\n')
def main():
    """Run the watermark-removal pass with the default directories."""
    remove_water_mark().read_content()


# `__name__` (not `name`) is the module attribute that equals "__main__"
# when the file is executed as a script.
if __name__ == "__main__":
    main()
题目描述
我想在处理到第一页时就直接转移文件,但是在处理 PDF 的时候,第一个 PDF 的第一个页面被赋给了第二个 PDF 的第一个页面,导致页面错乱了。