146 lines
5.1 KiB
Python
146 lines
5.1 KiB
Python
import os
|
|
import fitz
|
|
import numpy as np
|
|
import argparse
|
|
import traceback
|
|
import io
|
|
from PIL import Image
|
|
|
|
|
|
def clip(value, min_value=None, max_value=None):
    """Clamp *value* between *min_value* and *max_value*.

    Args:
        value: The value to clamp; it must be comparable with the bounds.
        min_value: Lower bound, or ``None`` to leave the value unbounded below.
        max_value: Upper bound, or ``None`` to leave the value unbounded above.

    Returns:
        ``value`` itself when it lies within the bounds, otherwise the bound
        it violated. The lower bound is applied first, so if the bounds are
        inverted (``min_value > max_value``) the upper bound wins.
    """
    if min_value is not None and value < min_value:
        value = min_value
    if max_value is not None and value > max_value:
        value = max_value
    return value
|
|
|
|
def parse_workdir(filepath):
    """Return the directory part of *filepath* using ``/`` as the separator.

    Both ``\\`` and ``/`` are accepted as separators in the input,
    independently of the platform the script runs on.

    Args:
        filepath: Path to a file, absolute or relative.

    Returns:
        The containing directory with forward slashes. ``'.'`` is returned
        when the path has no directory component, ``'/'`` for a file directly
        under the root, and ``'X:/'`` for a file directly under a drive root.
    """
    normalized = filepath.replace('\\', '/')
    head, sep, _tail = normalized.rpartition('/')
    if not sep:
        # no separator at all: the file lives in the current directory.
        return '.'
    if head == '':
        # '/file.pdf' -> the root itself, not the current directory.
        return '/'
    if len(head) == 2 and head[1] == ':':
        # 'C:' alone is drive-relative when joined with a name; keep the
        # trailing slash so the result still points at the drive root.
        return head + '/'
    return head
|
|
|
|
def _ensure_pdf_ext(name):
    """Return *name* with a ``.pdf`` suffix, adding one only when missing."""
    return name if name.lower().endswith('.pdf') else name + '.pdf'


def _content_bounds(pixels, background_color, thresh):
    """Locate the bounding box of everything that differs from the background.

    Args:
        pixels: Rendered page as a float array indexed (row, column, channel).
        background_color: Value(s) broadcastable to ``pixels.shape``.
        thresh: Minimum mean absolute channel difference for a pixel to count
            as content.

    Returns:
        ``(left, top, right, bottom)`` in pixel units with exclusive
        right/bottom edges, or ``None`` when no pixel exceeds *thresh*.
    """
    background = np.broadcast_to(background_color, pixels.shape)
    diff = np.abs(pixels - background).mean(axis=2)
    points = np.argwhere(diff > thresh)
    if points.size == 0:
        return None
    # argwhere yields (row, column) pairs, i.e. (y, x).
    top, left = points.min(axis=0)
    bottom, right = points.max(axis=0) + 1
    return left, top, right, bottom


def _crop_page(page, args):
    """Shrink the crop box of *page* to its content plus ``args.border``.

    Returns:
        ``True`` when a new crop box was set, ``False`` when the page has no
        content above ``args.thresh`` and was left at its media box.
    """
    # NOTE(review): setCropBox / MediaBox / getPixmap are the camelCase
    # PyMuPDF names used by the original code; confirm they exist in the
    # installed PyMuPDF version.
    # recover crop box so the whole media box is rendered.
    page.setCropBox(page.MediaBox)

    # render page as img
    zoom = args.zoom
    pixmap = page.getPixmap(matrix=fitz.Matrix(zoom, zoom))

    # convert the img to nparray (PIL decodes the bytes produced by tobytes()).
    pixels = np.asarray(Image.open(io.BytesIO(pixmap.tobytes())),
                        dtype=np.float32)

    bounds = _content_bounds(pixels, args.background_color, args.thresh)
    if bounds is None:
        return False

    # calculate real crop coordinate: rendered pixels -> page units.
    left, top, right, bottom = [coord / zoom for coord in bounds]
    width = pixels.shape[1] / zoom
    height = pixels.shape[0] / zoom

    # reserve border, then keep the box inside the rendered page.
    border = args.border
    left = clip(left - border, 0, width)
    top = clip(top - border, 0, height)
    right = clip(right + border, 0, width)
    bottom = clip(bottom + border, 0, height)

    if args.visual:
        print('cropbox =', (left, top), (right, bottom))
    page.setCropBox(fitz.Rect(left, top, right, bottom))
    return True


def crop(args):
    """Crop the margins of every page of a PDF (or PowerPoint) file.

    A ``.ppt``/``.pptx`` input is first exported to PDF next to the input
    through PowerPoint COM automation, and the slide notes are read as
    per-page output names. Each page is then rendered, the bounding box of
    all pixels differing from the background is computed, and the page's
    crop box is set to that box plus a border. The result is saved as one
    PDF, or as one PDF per page when ``args.split`` is set.

    Args:
        args: Namespace providing
            ``input`` (source path), ``output`` (target path or ``None`` for
            ``<input>_crop.pdf``), ``thresh``, ``zoom``, ``background_color``,
            ``border``, ``visual`` (print each crop box), ``split``,
            ``names`` (comma separated names, or a ``.txt`` file relative to
            the input directory with one name per line, or ``None``) and
            ``mute`` (suppress the per-page save message).
    """
    # process filepath: absolute path with the platform's own separators.
    infile = os.path.abspath(args.input)
    inbase, ext = os.path.splitext(infile)

    outfile = args.output
    if outfile is None:
        outfile = inbase + '_crop.pdf'
    outbase = os.path.splitext(outfile)[0]
    # File-name part only: the per-page names are joined with outworkdir
    # below, so keeping the directory here would duplicate it.
    out_stem = outbase.replace('\\', '/').rsplit('/', 1)[-1]
    names = None

    # parse working dir.
    inworkdir = parse_workdir(infile)
    outworkdir = parse_workdir(outfile)

    if ext.lower() in ('.ppt', '.pptx'):
        # lets process pptx.
        import comtypes.client
        from pptx import Presentation

        # convert to pdf.
        powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
        powerpoint.Visible = 1
        slides = powerpoint.Presentations.Open(infile)
        try:
            print('convert {0} to {1}'.format(infile, inbase + '.pdf'))
            # 32 is PowerPoint's ppSaveAsPDF file-format constant.
            slides.SaveAs(inbase + '.pdf', 32)
        finally:
            slides.close()
        # NOTE: the PowerPoint application is left running; the original
        # code has its Quit() call disabled.

        # read comments: a slide's notes text names its output page.
        names = []
        for i, slide in enumerate(Presentation(infile).slides):
            frame = slide.notes_slide.notes_text_frame
            anno = frame.text.strip() if frame is not None else ''
            if anno == '':
                anno = '{0}_{1}.pdf'.format(out_stem, i + 1)
            names.append(_ensure_pdf_ext(anno))
        infile = inbase + '.pdf'

    if os.path.splitext(infile)[1].lower() != '.pdf':
        infile += '.pdf'

    pdf = fitz.open(infile)
    try:
        for i, page in enumerate(pdf):
            try:
                if not _crop_page(page, args) and not args.mute:
                    print('page', i + 1, 'has no content, left uncropped')
            except KeyboardInterrupt:
                print('Operation Cancled.')
                break
            except Exception:
                # best effort: report the page and go on with the next one.
                print('Error happened while cropint page', i+1)
                print(traceback.format_exc())

        if not args.split:
            pdf.save(outfile)
        else:
            if args.names is not None:
                # explicit names override the ones taken from slide notes.
                spec = args.names
                namesfile = os.path.join(inworkdir, spec)
                if os.path.isfile(namesfile) and spec.split('.')[-1] == 'txt':
                    with open(namesfile, 'r', encoding='utf-8') as f:
                        names = [line.strip() for line in f]
                    names = [n for n in names if n != '']
                else:
                    names = spec.split(',')

            names = [_ensure_pdf_ext(n) for n in (names or [])]
            # fall back to '<output>_<page>.pdf' for pages without a name.
            names += ['{0}_{1}.pdf'.format(out_stem, i + 1)
                      for i in range(len(names), len(pdf))]
            names = [os.path.join(outworkdir, name) for name in names]

            # zip stops at the page count, so surplus names are ignored.
            for i, name in zip(range(len(pdf)), names):
                page_pdf = fitz.Document()
                try:
                    page_pdf.insertPDF(pdf, from_page=i, to_page=i)
                    page_pdf.save(name)
                finally:
                    page_pdf.close()
                if not args.mute:
                    print('saving cropped page to', name)
    finally:
        pdf.close()