pdfconcat
changeset 0:8f7e68d54c6d
initial commit: should be already usable
author | markus schnalke <meillo@marmaro.de> |
---|---|
date | Thu, 29 Aug 2013 13:58:17 +0200 |
parents | |
children | 6ea97e3f7cb5 |
files | .user.ini README bin/cleanup.cron bin/monthly-stats bin/pdfconcat bin/pdfdetextify pdfconcat.php |
diffstat | 7 files changed, 228 insertions(+), 0 deletions(-) [+] |
line diff
1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/.user.ini Thu Aug 29 13:58:17 2013 +0200 1.3 @@ -0,0 +1,2 @@ 1.4 +upload_max_filesize = 8M 1.5 +post_max_size = 8M
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 2.2 +++ b/README Thu Aug 29 13:58:17 2013 +0200 2.3 @@ -0,0 +1,53 @@ 2.4 +pdfconcat 2.5 +--------- 2.6 + 2.7 +This program concatenates PDF files and optionally converts text 2.8 +within them into bitmaps. The concatenation is done with Ghostscript (gs); 2.9 +the detextification is done with a gs-conversion to tiff and tiff2pdf 2.10 +(package libtiff-tools) afterwards. Detextification is only in gray-scale. 2.11 + 2.12 + 2.13 +Contents: 2.14 + 2.15 +- bin/pdfconcat shell script to concatenate PDF files 2.16 +- bin/pdfdetextify shell script to convert text to images in PDF files 2.17 +- pdfconcat.php web interface to invoke the scripts 2.18 + 2.19 +- log log file of web interface invocations (writable) 2.20 +- bin/monthly-stats shell script to sum up the usage by month 2.21 + 2.22 +- upload/ directory to store the converted files (writable) 2.23 +- bin/cleanup.cron helper script to remove old uploaded files 2.24 + 2.25 +- .user.ini php config file to increase the max upload file size 2.26 + 2.27 + 2.28 +Installation: 2.29 + 2.30 +You need a Unix system. Ensure you have gs and tiff2pdf (in the package 2.31 +libtiff-tools) available. 2.32 + 2.33 +To use the bin/pdf* tools on the command line only, copy them to your $PATH 2.34 +and make them executable. 2.35 + 2.36 +To set up the program including the web interface, copy the files to a place 2.37 +below the webserver root. Make bin/pdf{concat,detextify} executable for 2.38 +www-data. Make the log and the upload directory writable by www-data. Install 2.39 +a cronjob to clean up the upload directory. Ensure that the max upload file 2.40 +size of PHP is large enough. 
2.41 + 2.42 + 2.43 +More or less helpful information sources on the PDF conversion: 2.44 + 2.45 + http://stackoverflow.com/questions/6002261/pdf-to-tiff-imagemagick-problem 2.46 + http://www.asmail.be/msg0055376363.html 2.47 + http://kvz.io/blog/2007/11/28/php-tiff2pdf/ 2.48 + http://phpdave.wordpress.com/tag/php-pdf-to-tiff/ 2.49 + 2.50 + 2.51 +2013, markus schnalke <meillo@marmaro.de> 2.52 +Written at KIT-Library, Karlsruhe. 2.53 + 2.54 +This program is in the public domain. -- Dieses Programm hat nicht die 2.55 +notwendige Schoepfungshoehe um urheberrechtlich geschuetzt zu sein. If 2.56 +you need a license nonetheless, use it under the CC0 license.
#!/bin/sh
#
# bin/cleanup.cron: list the old entries of the upload directory
#
# Only prints the names; nothing is removed here. The output is
# meant to be piped into: `xargs rm -f'

# need the number of days plus at least one directory
[ $# -ge 2 ] || {
	echo "usage: ${0##*/} NUM_OF_DAYS_TO_KEEP DIR..." >&2
	exit 1
}

keep_days="$1"
shift

# direct children of each DIR (not DIR itself, nothing deeper) that
# satisfy find's `-atime +N' test with N = keep_days
find "$@" -maxdepth 1 -mindepth 1 -atime +"$keep_days" -print
#!/bin/sh
#
# bin/monthly-stats: sum up the usage by month
#
# Reads the files given as arguments (sed falls back to stdin if there
# are none). Every line that starts with `[' followed by a `....-..'
# prefix is cut down to that prefix; uniq then counts each run of equal
# adjacent lines.

sed -e 's/^\[\(....-..\).*/\1/' "$@" |
	uniq -c
#!/bin/sh
#
# bin/pdfconcat: join the PDF files named on the command line
#
# Ghostscript's pdfwrite device produces the result, which is written
# to stdout (-sOutputFile=-).

gs -q \
	-dNOPAUSE -dBATCH \
	-sPAPERSIZE=a4 \
	-dPDFSETTINGS=/prepress \
	-sDEVICE=pdfwrite \
	-sOutputFile=- \
	"$@"
#!/bin/sh
#
# bin/pdfdetextify: convert pdf to tiff and back to pdf in order to
# convert text to image
#
# For each PDF given as argument, gs renders it into a gray-scale,
# LZW-compressed TIFF at 300 dpi (scratch file), then tiff2pdf turns
# that TIFF back into a PDF.
# writes to stdout
#
# depends on: gs, tiff2pdf (libtiff-tools)
#
# NOTE(review): with more than one argument, one PDF per input is written
# back to back to stdout. The web interface passes a single file per call.

# Stop right away if no scratch file could be created; otherwise gs
# would be run with an empty output path.
temp=$(mktemp "/tmp/${0##*/}.XXXXXX") || exit 1

# Remove the scratch file whenever the script exits ...
trap 'rm -f "$temp"' 0
# ... and turn HUP/INT/QUIT/TERM into an exit, so that the loop does not
# carry on after a signal. The exit trap above still does the cleanup.
trap 'exit 1' 1 2 3 15

for i do
	# echo "processing $i"
	gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress \
		-r300 -o "$temp" -sDEVICE=tiffgray -sCompression=lzw "$i"
	# no -o given: tiff2pdf writes the PDF to stdout
	tiff2pdf -z "$temp"
done
<!--
	pdfconcat-0.1
	Written by markus schnalke <meillo@marmaro.de>,
	developed at KIT-Library, Karlsruhe.
	This is free software under the CC0 license.
	http://marmaro.de/prog/pdfconcat
-->
<html>
<head>
<title>PDF concat and detextify</title>
<meta name="author" content="markus schnalke <meillo@marmaro.de>">
<meta name="copyright" content="No copyright applies.">
</head>
<body>
<h2>PDF concat and detextify</h2>

<?php

// helper scripts, located relative to this file
define('PDFDETEXTIFY', dirname(__FILE__).'/bin/pdfdetextify');
define('PDFCONCAT', dirname(__FILE__).'/bin/pdfconcat');

// LOGFILE is an absolute path; UPLOADDIR is relative to this file's
// directory and is also used to build the download link
define('LOGFILE', dirname(__FILE__).'/log');
define('UPLOADDIR', 'upload');


/*
 * Run the detextify script on $file.
 *
 * The script's stdout (the new PDF) is redirected into a fresh temporary
 * file. Because `2>&1' comes before `>file', its stderr ends up on the
 * original stdout, which system() passes through to the page.
 *
 * $file: path of the PDF to convert
 * Returns the path of the temporary file holding the result. The caller
 * is responsible for removing it.
 */
function
detextify($file)
{
	$newfile = tempnam(sys_get_temp_dir(), basename(__FILE__).".");
	// quote every path: none of them may be interpreted by the shell
	$cmd = sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFDETEXTIFY),
			escapeshellarg($file), escapeshellarg($newfile));
	system($cmd);
	return $newfile;
}


/*
 * Concatenate the PDFs in $files (in array order) into one new PDF
 * inside UPLOADDIR, named after the current date and time.
 *
 * As in detextify(), diagnostics of the script go to the page while
 * its stdout goes into the new file. All input files are deleted
 * afterwards.
 *
 * $files: array of paths of the PDFs to join
 * Returns the path of the new PDF relative to this file's directory
 * (UPLOADDIR/NAME.pdf), usable as link target.
 *
 * NOTE(review): the name has a resolution of one second; two requests
 * within the same second would write to the same file.
 */
function
concatpdfs($files)
{
	$newfile = sprintf("%s/%s/%s.pdf", dirname(__FILE__), UPLOADDIR,
			date('Y-m-d_H-i-s'));
	$cmd = sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFCONCAT),
			implode(' ', array_map('escapeshellarg', $files)),
			escapeshellarg($newfile));
	system($cmd);
	foreach ($files as $file) {
		unlink($file);
	}
	return sprintf("%s/%s", UPLOADDIR, basename($newfile));
}


/*
 * Process the uploaded files of the current request.
 *
 * Empty upload slots are ignored; failed uploads are reported on the
 * page and skipped. A file whose `<field>detextify' checkbox is on gets
 * detextified first. The remaining files are concatenated and the
 * creation is appended to LOGFILE.
 *
 * Returns the relative path of the concatenated PDF, or null if no
 * usable file was uploaded (nothing is created or logged then).
 */
function
procfiles()
{
	$date = date("Y-m-d H:i:s");
	$ip = $_SERVER['REMOTE_ADDR'];
	$files = array();
	foreach ($_FILES as $key => $val) {
		if ($val['error'] == UPLOAD_ERR_NO_FILE) {
			continue;
		}
		if ($val['error'] > 0) {
			// the file name is supplied by the client: escape it
			$name = htmlspecialchars($val['name'], ENT_QUOTES);
			echo "Errors in transferring $name. Skipping.\n";
			echo "($val[error])\n";
			continue;
		}
		if (isset($_POST[$key.'detextify']) && $_POST[$key.'detextify'] == 'on') {
			$files[] = detextify($val['tmp_name']);
		} else {
			$files[] = $val['tmp_name'];
		}
	}
	if (count($files) == 0) {
		// do not run gs without any input
		echo "No files were uploaded. Nothing to do.\n";
		return null;
	}
	$newfile = concatpdfs($files);
	// log; the lock keeps concurrent requests from interleaving entries
	$logmsg = sprintf("[%s] %s creates `%s'\n", $date, $ip, $newfile);
	file_put_contents(LOGFILE, $logmsg, FILE_APPEND | LOCK_EX);

	return $newfile;

}


// main()

if (isset($_POST['submit'])) {
	// messages of procfiles() and of the scripts appear inside the <pre>
	echo '<pre>';
	$outfile = procfiles();
	echo '</pre>';
	echo '<hr>';
	if ($outfile !== null) {
		echo '<h2><a href="'. $outfile .'">The concatenated PDF</a></h2>';
	} else {
		echo '<h2>No PDF was created.</h2>';
	}
	echo '<hr>';
}

?>


<p>
	This webservice concatenates PDF files and optionally converts their
	text to bitmaps.
</p>
<p>
	The files are stored temporarily on the webserver. The detextification
	function modifies them. Use this service only if you have the
	appropriate rights on the files.
</p>

<form action="<?php echo basename($_SERVER['SCRIPT_NAME']); ?>"
		method="post" enctype="multipart/form-data">
<p>
	<input type="file" name="pdf1" />
	detextify? <input type="checkbox" name="pdf1detextify" />
	<br />
	<input type="file" name="pdf2" />
	detextify? <input type="checkbox" name="pdf2detextify" />
	<br />
	<input type="file" name="pdf3" />
	detextify? <input type="checkbox" name="pdf3detextify" />
	<br />
	<input type="file" name="pdf4" />
	detextify? <input type="checkbox" name="pdf4detextify" />
	<br />
	<input type="file" name="pdf5" />
	detextify? <input type="checkbox" name="pdf5detextify" />
</p>
<p>
	(Maximum file size: <?php echo ini_get('upload_max_filesize'); ?>)
</p>
<p>
	<input type="submit" name="submit" />
</p>
</form>

</body>
</html>