PDFCropper/cropper.py

141 lines
5.0 KiB
Python

import os
import fitz
import numpy as np
import argparse
import traceback
import io
from PIL import Image
def clip(value, min_value, max_value):
if min_value is not None:
if value < min_value:
value = min_value
if max_value is not None:
if value > max_value:
value = max_value
return value
def crop(args):
infile = args.input
# process filepath
infile = os.path.join(os.getcwd(), infile)
infile = '\\'.join(infile.split('/'))
inbase = infile.rsplit('.', maxsplit=1)[0]
outfile = args.output
if outfile is None:
outfile = inbase + '_crop.pdf'
outbase = outfile.rsplit('.', maxsplit=1)[0]
names = None
thresh = args.thresh
# parse working dir.
workdir = '/'.join('/'.join(infile.split('\\')).split('/')[:-1])
if workdir == '':
workdir = '.'
if infile.split('.')[-1].lower() in ['ppt', 'pptx']:
# lets process pptx.
import comtypes.client
from pptx import Presentation
# convert to pdf.
powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
powerpoint.Visible = 1
slides = powerpoint.Presentations.Open(infile)
print('convert {0} to {1}'.format(infile, inbase + '.pdf'))
slides.SaveAs(inbase + '.pdf', 32)
slides.close()
# powerpoint.Quit()
# read comments.
ppt = Presentation(infile)
slides = ppt.slides
names = []
for i, slide in enumerate(slides):
anno = slide.notes_slide.notes_text_frame.text.strip()
if anno == '':
anno = '{0}_{1}.pdf'.format(outbase, i+1)
names.append(anno)
names = [n + '.pdf' if n[:-4] != '.pdf' else n for n in names]
infile = inbase + '.pdf'
if infile.split('.')[-1].lower() != 'pdf':
infile += '.pdf'
pdf = fitz.open(infile)
for i, page in enumerate(pdf):
try:
# recover crop box
page.setCropBox(page.MediaBox)
# render page as img
zoom = args.zoom
mat = fitz.Matrix(zoom, zoom)
img = page.getPixmap(matrix=mat)
# convert the img to nparray.
# img = [[img.pixel(j, i) for j in range(img.w)] for i in range(img.h)]
img = Image.open(io.BytesIO(img.tobytes()))
img = np.asarray(img, dtype=np.float32)
# get ltrb
background = np.broadcast_to(args.background_color, (img.shape))
diff = np.abs(img - background).mean(axis=2)
points = np.argwhere(diff > thresh)
x = points[:, 1]
y = points[:, 0]
left, right = min(x), max(x)
top, bottom = min(y), max(y)
right += 1
bottom += 1
# calculate real crop coordinate.
ltrb = left, top, right, bottom
ltrb = [coord/zoom for coord in ltrb]
# reserve border.
border = args.border
left, top, right, bottom = [x+y for x, y in zip(ltrb, [-border,-border,border,border])]
# clip ltrb
left = clip(left, 0, img.shape[1]/zoom)
top = clip(top, 0, img.shape[0]/zoom)
right = clip(right, 0, img.shape[1]/zoom)
bottom = clip(bottom, 0, img.shape[0]/zoom)
if args.visual:
print('cropbox =', (left, top), (right, bottom))
page.setCropBox(fitz.Rect(left, top, right, bottom))
except KeyboardInterrupt:
print('Operation Cancled.')
break
except:
print('Error happened while cropint page', i+1)
print(traceback.format_exc())
if not args.split:
pdf.save(outfile)
else:
if names is None:
names = ['{0}_{1}.pdf'.format(outbase, i+1) for i in range(len(pdf))]
if args.names is not None:
names = args.names
namesfile = os.path.join(workdir, names)
if os.path.isfile(namesfile) and names.split('.')[-1] == 'txt':
with open(namesfile, 'r', encoding='utf-8') as f:
names = f.read()
names = names.split('\n')
names = [n.strip() for n in names]
names = [n for n in names if n != '']
else:
names = names.split(',')
names = [n + '.pdf' if n[:-4] != '.pdf' else n for n in names]
if len(names) < len(pdf):
names += ['{0}_{1}.pdf'.format(outbase, i+1) for i in range(len(names), len(pdf))]
names = [os.path.join(workdir, name) for name in names]
for i, (name, page) in enumerate(zip(names, pdf)):
page_pdf = fitz.Document()
page_pdf.insertPDF(pdf, from_page=i, to_page=i)
page_pdf.save(name)
if not args.mute:
print('saving cropped page to', name)
pdf.close()