pdfconcat

changeset 0:8f7e68d54c6d

initial commit: should be already usable
author markus schnalke <meillo@marmaro.de>
date Thu, 29 Aug 2013 13:58:17 +0200
parents
children 6ea97e3f7cb5
files .user.ini README bin/cleanup.cron bin/monthly-stats bin/pdfconcat bin/pdfdetextify pdfconcat.php
diffstat 7 files changed, 228 insertions(+), 0 deletions(-) [+]
line diff
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/.user.ini	Thu Aug 29 13:58:17 2013 +0200
     1.3 @@ -0,0 +1,2 @@
     1.4 +upload_max_filesize = 8M
     1.5 +post_max_size = 8M
     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/README	Thu Aug 29 13:58:17 2013 +0200
     2.3 @@ -0,0 +1,53 @@
     2.4 +pdfconcat
     2.5 +---------
     2.6 +
     2.7 +This program concatenates PDF files and optionally converts text
     2.8 +within them into bitmaps. The concatenation is done with Ghostscript (gs);
     2.9 +the detextification is done with a gs-conversion to tiff and tiff2pdf
    2.10 +(package libtiff-tools) afterwards. Detextification is only in gray-scale.
    2.11 +
    2.12 +
    2.13 +Contents:
    2.14 +
    2.15 +- bin/pdfconcat		shell script to concatenate PDF files
    2.16 +- bin/pdfdetextify	shell script to convert text to images in PDF files
    2.17 +- pdfconcat.php		web interface to invoke the scripts
    2.18 +
    2.19 +- log			log file of web interface invocations (writable)
    2.20 +- bin/monthly-stats	shell script to sum up the usage by month
    2.21 +
    2.22 +- upload/		directory to store the converted files (writable)
    2.23 +- bin/cleanup.cron	helper script to remove old uploaded files
    2.24 +
    2.25 +- .user.ini		php config file to increase the max upload file size
    2.26 +
    2.27 +
    2.28 +Installation:
    2.29 +
    2.30 +You need a Unix system. Ensure you have gs and tiff2pdf (in the package
    2.31 +libtiff-tools) available.
    2.32 +
    2.33 +To use the bin/pdf* tools on the command line only, copy them to your $PATH
    2.34 +and make them executable.
    2.35 +
    2.36 +To set up the program including the web interface, copy the files to a place
    2.37 +below the webserver root. Make bin/pdf{concat,detextify} executable for
    2.38 +www-data. Make the log and the upload directory writable by www-data. Install
    2.39 +a cronjob to clean up the upload directory. Ensure that the max upload file
    2.40 +size of PHP is large enough.
    2.41 +
    2.42 +
    2.43 +More or less helpful information sources on the PDF conversion:
    2.44 +
    2.45 +	http://stackoverflow.com/questions/6002261/pdf-to-tiff-imagemagick-problem
    2.46 +	http://www.asmail.be/msg0055376363.html
    2.47 +	http://kvz.io/blog/2007/11/28/php-tiff2pdf/
    2.48 +	http://phpdave.wordpress.com/tag/php-pdf-to-tiff/
    2.49 +
    2.50 +
    2.51 +2013, markus schnalke <meillo@marmaro.de>
    2.52 +Written at KIT-Library, Karlsruhe.
    2.53 +
    2.54 +This program is in the public domain. -- Dieses Programm hat nicht die
    2.55 +notwendige Schoepfungshoehe um urheberrechtlich geschuetzt zu sein. If
    2.56 +you nevertheless need a license, use it under the CC0 license.
     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/bin/cleanup.cron	Thu Aug 29 13:58:17 2013 +0200
     3.3 @@ -0,0 +1,14 @@
     3.4 +#!/bin/sh
     3.5 +#
     3.6 +# list the entries directly inside the given directories that match
     3.7 +# `find -atime +NUM_OF_DAYS_TO_KEEP'; meant to be piped into: `xargs rm -f'
     3.8 +
     3.9 +case $# in
    3.10 +0|1)	echo "usage: ${0##*/} NUM_OF_DAYS_TO_KEEP DIR..." >&2
    3.11 +	exit 1 ;;
    3.12 +esac
    3.13 +
    3.14 +keep="$1"	# day count for find's -atime test
    3.15 +shift		# the remaining arguments are the directories
    3.16 +
    3.17 +find "$@" -maxdepth 1 -mindepth 1 -atime +"$keep" -print
     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/bin/monthly-stats	Thu Aug 29 13:58:17 2013 +0200
     4.3 @@ -0,0 +1,5 @@
     4.4 +#!/bin/sh
     4.5 +#
     4.6 +# sum up the usage by month
     4.7 +
     4.8 +sed 's,^\[\(....-..\).*,\1,' "$@" | uniq -c
     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/bin/pdfconcat	Thu Aug 29 13:58:17 2013 +0200
     5.3 @@ -0,0 +1,5 @@
     5.4 +#!/bin/sh
     5.5 +#
     5.6 +# concatenate the given PDF files to stdout
     5.7 +
     5.8 +gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress -sDEVICE=pdfwrite -sOutputFile=- "$@"
     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/bin/pdfdetextify	Thu Aug 29 13:58:17 2013 +0200
     6.3 @@ -0,0 +1,16 @@
     6.4 +#!/bin/sh
     6.5 +#
     6.6 +# convert pdf to tiff and back to pdf in order to convert text to image
     6.7 +# writes to stdout (one PDF per input file, in argument order)
     6.8 +#
     6.9 +# depends on: gs, tiff2pdf (libtiff-tools)
    6.10 +
    6.11 +temp=$(mktemp "/tmp/${0##*/}.XXXXXX") || exit 1	# without a temp file gs has no target
    6.12 +trap 'rm -f "$temp"' 0 1 2 3 15
    6.13 +
    6.14 +for i do
    6.15 +	# run tiff2pdf only if gs succeeded: never convert a stale or partial TIFF
    6.16 +	gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress \
    6.17 +			-r300 -o "$temp" -sDEVICE=tiffgray -sCompression=lzw "$i" &&
    6.18 +	tiff2pdf -z "$temp"
    6.19 +done
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/pdfconcat.php	Thu Aug 29 13:58:17 2013 +0200
     7.3 @@ -0,0 +1,133 @@
     7.4 +<!--
     7.5 +	pdfconcat-0.1
     7.6 +	Written by markus schnalke <meillo@marmaro.de>,
     7.7 +		developed at KIT-Library, Karlsruhe.
     7.8 +	This is free software under the CC0 license.
     7.9 +	http://marmaro.de/prog/pdfconcat
    7.10 +-->
    7.11 +<html>
    7.12 +<head>
    7.13 +<title>PDF concat and detextify</title>
    7.14 +<meta name="author" content="markus schnalke <meillo@marmaro.de>">
    7.15 +<meta name="copyright" content="No copyright applies.">
    7.16 +</head>
    7.17 +<body>
    7.18 +<h2>PDF concat and detextify</h2>
    7.19 +
    7.20 +<?php
    7.21 +// locations of the helper scripts that do the actual PDF work
    7.22 +define('PDFDETEXTIFY', dirname(__FILE__).'/bin/pdfdetextify');
    7.23 +define('PDFCONCAT', dirname(__FILE__).'/bin/pdfconcat');
    7.24 +// log file (absolute path) and upload directory (name only, relative)
    7.25 +define('LOGFILE', dirname(__FILE__).'/log');
    7.26 +define('UPLOADDIR', 'upload');
    7.27 +
    7.28 +// Rasterize the text of PDF $file via bin/pdfdetextify; returns the new temp file.
    7.29 +function
    7.30 +detextify($file)
    7.31 +{
    7.32 +	$newfile = tempnam(sys_get_temp_dir(), basename(__FILE__).".");
    7.33 +	// shell-quote each word; `2>&1 >file' sends stderr to the page, stdout to the file
    7.34 +	system(sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFDETEXTIFY), escapeshellarg($file), escapeshellarg($newfile)));
    7.35 +	return $newfile;
    7.36 +}
    7.37 +
    7.38 +// Concatenate the PDFs in $files into UPLOADDIR/<timestamp>.pdf and delete the inputs.
    7.39 +function
    7.40 +concatpdfs($files)
    7.41 +{
    7.42 +	$newfile = sprintf("%s/%s/%s.pdf", dirname(__FILE__), UPLOADDIR,
    7.43 +			date('Y-m-d_H-i-s'));
    7.44 +	// shell-quote each word; `2>&1 >file' sends stderr to the page, stdout to the file
    7.45 +	$cmd = sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFCONCAT),
    7.46 +			implode(' ', array_map('escapeshellarg', $files)), escapeshellarg($newfile));
    7.47 +	system($cmd);
    7.48 +	array_map('unlink', $files);	// remove the input files once they are merged
    7.49 +	// hand back the relative form of the path: UPLOADDIR/<timestamp>.pdf
    7.50 +	return sprintf("%s/%s", UPLOADDIR, basename($newfile));
    7.51 +}
    7.52 +
    7.53 +// Build one PDF from the uploads (detextifying where requested) and log it; returns its path.
    7.54 +function
    7.55 +procfiles()
    7.56 +{
    7.57 +	$date = date("Y-m-d H:i:s");
    7.58 +	$ip = $_SERVER['REMOTE_ADDR'];
    7.59 +	$files = array();
    7.60 +	foreach ($_FILES as $key => $val) {
    7.61 +		if ($val['error'] == UPLOAD_ERR_NO_FILE) {
    7.62 +			continue;	// this upload field was left empty
    7.63 +		}
    7.64 +		if ($val['error'] > 0) {
    7.65 +			// the file name is client-supplied: escape it before printing it into the page
    7.66 +			echo "Errors in transferring ".htmlspecialchars($val['name'], ENT_QUOTES).". Skipping.\n($val[error])\n";
    7.67 +			continue;
    7.68 +		}
    7.69 +		if (isset($_POST[$key.'detextify']) && $_POST[$key.'detextify'] == 'on') {
    7.70 +			$files[] = detextify($val['tmp_name']);
    7.71 +		} else {
    7.72 +			$files[] = $val['tmp_name'];
    7.73 +		}
    7.74 +	}
    7.75 +	$newfile = concatpdfs($files);
    7.76 +	// append one line per run to the log: [date] ip creates `file'
    7.77 +	$logmsg = sprintf("[%s] %s creates `%s'\n", $date, $ip, $newfile);
    7.78 +	file_put_contents(LOGFILE, $logmsg, FILE_APPEND);
    7.79 +
    7.80 +	return $newfile;
    7.81 +
    7.82 +}
    7.83 +
    7.84 +
    7.85 +// main(): on form submission, process the uploads and link to the result
    7.86 +
    7.87 +if (isset($_POST['submit'])) {
    7.88 +	// whatever procfiles() prints while working ends up inside the <pre>
    7.89 +	echo '<pre>';
    7.90 +	$outfile = procfiles();
    7.91 +	echo '</pre>', '<hr>';
    7.92 +	echo '<h2><a href="', $outfile, '">The concatenated PDF</a></h2>';
    7.93 +	echo '<hr>';
    7.94 +}
    7.95 +
    7.96 +?>
    7.97 +
    7.98 +
    7.99 +<p>
   7.100 +	This webservice concatenates PDF files and optionally converts their
   7.101 +	text to bitmaps.
   7.102 +</p>
   7.103 +<p>
   7.104 +	The files are stored temporarily on the webserver. The detextification
   7.105 +	function modifies them. Use this service only if you have the
   7.106 +	appropriate rights on the files.
   7.107 +</p>
   7.108 +
   7.109 +<form action="<?php echo basename($_SERVER['SCRIPT_NAME']); ?>"
   7.110 +	method="post" enctype="multipart/form-data">
   7.111 +<p>
   7.112 +	<input type="file" name="pdf1" />
   7.113 +	detextify? <input type="checkbox" name="pdf1detextify" />
   7.114 +	<br />
   7.115 +	<input type="file" name="pdf2" />
   7.116 +	detextify? <input type="checkbox" name="pdf2detextify" />
   7.117 +	<br />
   7.118 +	<input type="file" name="pdf3" />
   7.119 +	detextify? <input type="checkbox" name="pdf3detextify" />
   7.120 +	<br />
   7.121 +	<input type="file" name="pdf4" />
   7.122 +	detextify? <input type="checkbox" name="pdf4detextify" />
   7.123 +	<br />
   7.124 +	<input type="file" name="pdf5" />
   7.125 +	detextify? <input type="checkbox" name="pdf5detextify" />
   7.126 +</p>
   7.127 +<p>
   7.128 +	(Maximum file size: <?php echo ini_get('upload_max_filesize'); ?>)
   7.129 +</p>
   7.130 +<p>
   7.131 +	<input type="submit" name="submit" />
   7.132 +</p>
   7.133 +</form>
   7.134 +
   7.135 +</body>
   7.136 +</html>