PDFCropper/cropper.py

import os
import fitz
import numpy as np
import argparse
import traceback
import io
from PIL import Image


def clip(value, min_value, max_value):
    """Clamp *value* to the range ``[min_value, max_value]``.

    Either bound may be ``None``, in which case that side is left
    unbounded.  The lower bound is applied first, then the upper bound,
    so when the bounds are inverted the upper bound wins.
    """
    # Raise the value to the lower bound when one is given.
    floored = min_value if (min_value is not None and value < min_value) else value
    # Then cap the (possibly raised) value at the upper bound.
    return max_value if (max_value is not None and floored > max_value) else floored

def parse_workdir(filepath):
    """Return the directory part of *filepath* using ``/`` separators.

    Backslashes are treated as path separators and converted to forward
    slashes.  When nothing remains before the last separator (a bare
    file name, an empty string, or a file directly under the root) the
    current directory ``'.'`` is returned.
    """
    unified = filepath.replace('\\', '/')
    # Everything before the final separator is the containing directory.
    directory, _, _ = unified.rpartition('/')
    return directory if directory != '' else '.'

def _ensure_pdf_suffix(name):
    """Return *name* with ``.pdf`` appended unless it already ends with it."""
    return name if name.lower().endswith('.pdf') else name + '.pdf'


def crop(args):
    """Crop every page of a PDF (or PowerPoint file) to its visible content.

    Each page is rendered to an image, the bounding box of all pixels that
    differ from the background colour by more than a threshold is located,
    and the page's crop box is set to that bounding box plus a border.

    ``args`` is a namespace-like object providing:

    * ``input`` -- path of the input file.  ``.ppt``/``.pptx`` files are
      first exported to PDF through the PowerPoint COM interface; any
      other name without a ``.pdf`` extension gets ``.pdf`` appended.
    * ``output`` -- output PDF path, or ``None`` to use
      ``<input without extension>_crop.pdf``.
    * ``thresh`` -- minimum mean absolute per-channel difference from the
      background for a pixel to count as content.
    * ``zoom`` -- rendering scale used when rasterising each page.
    * ``background_color`` -- colour broadcast against the rendered image.
    * ``border`` -- margin (in page units) kept around the detected content.
    * ``visual`` -- when true, print the computed crop box of each page.
    * ``split`` -- when true, save every page to its own PDF file instead
      of writing a single output document.
    * ``names`` -- (split mode) either a comma separated list of output
      names or the name of a ``.txt`` file, relative to the input
      directory, holding one name per line.
    * ``mute`` -- (split mode) when true, do not print the saved paths.

    Errors on a single page are reported and the page is left uncropped;
    processing then continues with the next page.
    """
    # Absolute path with the platform's native separators (backslashes on
    # Windows, which the PowerPoint COM call below relies on).
    infile = os.path.abspath(args.input)
    inbase, inext = os.path.splitext(infile)

    outfile = args.output
    if outfile is None:
        outfile = inbase + '_crop.pdf'
    # Only the file stem is used for generated page names: they are joined
    # onto the output directory later, so they must not carry it themselves.
    outstem = os.path.basename(os.path.splitext(outfile)[0])
    names = None
    thresh = args.thresh

    # parse working dir.
    inworkdir = parse_workdir(infile)
    outworkdir = parse_workdir(outfile)

    def default_name(index):
        # Fallback per-page file name for the zero-based page *index*.
        return '{0}_{1}.pdf'.format(outstem, index + 1)

    if inext.lower() in ('.ppt', '.pptx'):
        # PowerPoint input: export to PDF, then read per-slide output names
        # from the slide notes.
        import comtypes.client
        from pptx import Presentation
        powerpoint = comtypes.client.CreateObject("Powerpoint.Application")
        powerpoint.Visible = 1
        slides = powerpoint.Presentations.Open(infile)
        print('convert {0} to {1}'.format(infile, inbase + '.pdf'))
        # 32 is the PowerPoint "save as PDF" file-type constant (ppSaveAsPDF).
        slides.SaveAs(inbase + '.pdf', 32)
        slides.close()
        # The PowerPoint application itself is intentionally left running.

        names = []
        for i, slide in enumerate(Presentation(infile).slides):
            # notes_text_frame is None when the notes slide has no notes
            # placeholder; treat that like an empty note.
            frame = slide.notes_slide.notes_text_frame
            anno = frame.text.strip() if frame is not None else ''
            names.append(anno if anno != '' else default_name(i))
        names = [_ensure_pdf_suffix(n) for n in names]
        infile = inbase + '.pdf'

    if os.path.splitext(infile)[1].lower() != '.pdf':
        infile += '.pdf'

    # Loop-invariant rendering parameters.
    zoom = args.zoom
    border = args.border

    pdf = fitz.open(infile)
    try:
        for i, page in enumerate(pdf):
            try:
                # recover crop box
                page.setCropBox(page.MediaBox)
                # render page as img
                pixmap = page.getPixmap(matrix=fitz.Matrix(zoom, zoom))

                # convert the img to nparray.
                img = Image.open(io.BytesIO(pixmap.tobytes()))
                img = np.asarray(img, dtype=np.float32)

                # Locate every pixel that differs enough from the background.
                background = np.broadcast_to(args.background_color, img.shape)
                diff = np.abs(img - background).mean(axis=2)
                points = np.argwhere(diff > thresh)
                if points.size == 0:
                    # Nothing to crop to: keep the full page (the crop box
                    # was reset to the media box above).
                    print('Page', i + 1,
                          'has no content above the threshold; left uncropped.')
                    continue

                # argwhere yields (row, column) pairs, i.e. (y, x).
                top, left = points.min(axis=0)
                # +1 so the box includes the last content pixel.
                bottom, right = points.max(axis=0) + 1

                # Convert from pixel to page coordinates and add the border.
                left = left / zoom - border
                top = top / zoom - border
                right = right / zoom + border
                bottom = bottom / zoom + border

                # Keep the box inside the rendered page area.
                page_width = img.shape[1] / zoom
                page_height = img.shape[0] / zoom
                left = clip(left, 0, page_width)
                top = clip(top, 0, page_height)
                right = clip(right, 0, page_width)
                bottom = clip(bottom, 0, page_height)
                if args.visual:
                    print('cropbox =', (left, top), (right, bottom))
                page.setCropBox(fitz.Rect(left, top, right, bottom))
            except KeyboardInterrupt:
                print('Operation Cancled.')
                break
            except Exception:
                # Best effort: report the failure and go on to the next page.
                print('Error happened while cropint page', i+1)
                print(traceback.format_exc())

        if not args.split:
            pdf.save(outfile)
        else:
            page_count = len(pdf)
            if names is None:
                names = [default_name(i) for i in range(page_count)]
            if args.names is not None:
                # Explicit names override any names taken from slide notes.
                names = args.names
                namesfile = os.path.join(inworkdir, names)
                if os.path.isfile(namesfile) and names.split('.')[-1] == 'txt':
                    with open(namesfile, 'r', encoding='utf-8') as f:
                        lines = f.read().split('\n')
                    names = [line.strip() for line in lines]
                    names = [n for n in names if n != '']
                else:
                    names = names.split(',')
                names = [_ensure_pdf_suffix(n) for n in names]
            if len(names) < page_count:
                # Not enough names supplied: fill up with generated ones.
                names += [default_name(i) for i in range(len(names), page_count)]
            names = [os.path.join(outworkdir, name) for name in names]
            # zip stops at the page count, so surplus names are ignored.
            for i, name in zip(range(page_count), names):
                page_pdf = fitz.Document()
                page_pdf.insertPDF(pdf, from_page=i, to_page=i)
                page_pdf.save(name)
                if not args.mute:
                    print('saving cropped page to', name)
    finally:
        # Release the document even when saving fails.
        pdf.close()