# HG changeset patch # User markus schnalke # Date 1377777497 -7200 # Node ID 8f7e68d54c6ded5c20fbe0249c3eca6167c80691 initial commit: should be already usable diff -r 000000000000 -r 8f7e68d54c6d .user.ini --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/.user.ini Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,2 @@ +upload_max_filesize = 8M +post_max_size = 8M diff -r 000000000000 -r 8f7e68d54c6d README --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/README Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,53 @@ +pdfconcat +--------- + +This program concatenates PDF files and optionally converts text +within them into bitmaps. The concatenation is done with Ghostscript (gs); +the detextification is done with a gs-conversion to tiff and tiff2pdf +(package libtiff-tools) afterwards. Detextification is only in gray-scale. + + +Contents: + +- bin/pdfconcat shell script to concatenate PDF files +- bin/pdfdetextify shell script to convert text to images in PDF files +- pdfconcat.php web interface to invoke the scripts + +- log log file of web interface invocations (writable) +- bin/monthly-stats shell script to sum up the usage by month + +- upload/ directory to store the converted files (writable) +- bin/cleanup.cron helper script to remove old uploaded files + +- .user.ini php config file to increase the max upload file size + + +Installation: + +You need a Unix system. Ensure you have gs and tiff2pdf (in the package +libtiff-tools) available. + +To use the bin/pdf* tools on the command line only, copy them to your $PATH +and make them executable. + +To set up the program including the web interface, copy the files to a place +below the webserver root. Make bin/pdf{concat,detextify} executable for +www-data. Make the log and the upload directory writable by www-data. Install +a cronjob to clean up the upload directory. Ensure that the max upload file +size of PHP is large enough. 
+ + +More or less helpful information sources on the PDF conversion: + + http://stackoverflow.com/questions/6002261/pdf-to-tiff-imagemagick-problem + http://www.asmail.be/msg0055376363.html + http://kvz.io/blog/2007/11/28/php-tiff2pdf/ + http://phpdave.wordpress.com/tag/php-pdf-to-tiff/ + + +2013, markus schnalke +Written at KIT-Library, Karlsruhe. + +This program is in the public domain. -- Dieses Programm hat nicht die +notwendige Schoepfungshoehe um urheberrechtlich geschuetzt zu sein. If +you though need a license, use it under the CC0 license. diff -r 000000000000 -r 8f7e68d54c6d bin/cleanup.cron --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/cleanup.cron Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,14 @@ +#!/bin/sh +# +# print list of old files from the upload directory +# output is meant to be piped into: `xargs rm -f' + +if [ $# -lt 2 ] ; then + echo "usage: ${0##*/} NUM_OF_DAYS_TO_KEEP DIR..." >&2 + exit 1 +fi + +days="$1" +shift + +find "$@" -maxdepth 1 -mindepth 1 -atime +"$days" -print diff -r 000000000000 -r 8f7e68d54c6d bin/monthly-stats --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/monthly-stats Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,5 @@ +#!/bin/sh +# +# sum up the usage by month + +sed 's,^\[\(....-..\).*,\1,' "$@" | uniq -c diff -r 000000000000 -r 8f7e68d54c6d bin/pdfconcat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/pdfconcat Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,5 @@ +#!/bin/sh +# +# concatenate the given PDF files to stdout + +gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress -sDEVICE=pdfwrite -sOutputFile=- "$@" diff -r 000000000000 -r 8f7e68d54c6d bin/pdfdetextify --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/bin/pdfdetextify Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,16 @@ +#!/bin/sh +# +# convert pdf to tiff and back to pdf in order to convert text to image +# writes to stdout +# +# depends on: gs, tiff2pdf (libtiff-tools) + +temp="`mktemp /tmp/${0##*/}.XXXXXX`" +trap 'rm -f "$temp"' 0 1 2 3 
15 + +for i do + # echo "processing $i" + gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress \ + -r300 -o "$temp" -sDEVICE=tiffgray -sCompression=lzw "$i" + tiff2pdf -z "$temp" +done diff -r 000000000000 -r 8f7e68d54c6d pdfconcat.php --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/pdfconcat.php Thu Aug 29 13:58:17 2013 +0200 @@ -0,0 +1,133 @@ + + + +PDF concat and detextify + + + + +

PDF concat and detextify

+
+&1 >%s", PDFDETEXTIFY, $file, $newfile);
+	system($cmd);
+	return $newfile;
+}
+
+
+// Concatenate the given PDF files into one new PDF below UPLOADDIR.
+//
+// $files: list of filesystem paths of the input PDFs, in output order.
+// Returns the path of the new file relative to this script's directory
+// (UPLOADDIR/<timestamp>.pdf).
+//
+// Every input file is deleted afterwards.
+// NOTE(review): the output name has one-second resolution, so two requests
+// finishing within the same second would write to the same file -- confirm
+// whether that matters for the expected load.
+function
+concatpdfs($files)
+{
+	$newfile = sprintf("%s/%s/%s.pdf", dirname(__FILE__), UPLOADDIR,
+			date('Y-m-d_H-i-s'));
+	// The command line is interpreted by a shell, so quote every path;
+	// otherwise a directory name containing a space or a shell
+	// metacharacter breaks (or alters) the command.
+	$args = implode(' ', array_map('escapeshellarg', $files));
+	// `2>&1 >file': stderr is passed through by system() to the page,
+	// only stdout (the PDF) goes into the new file.
+	$cmd = sprintf("%s %s 2>&1 >%s", PDFCONCAT, $args,
+			escapeshellarg($newfile));
+	system($cmd);
+	foreach ($files as $file) {
+		unlink($file);
+	}
+	return sprintf("%s/%s", UPLOADDIR, basename($newfile));
+}
+
+
+// Process the uploads of the current request.
+//
+// Walks over $_FILES, skips empty upload slots, reports failed transfers
+// on the page, runs detextify() on every file whose `<field>detextify'
+// checkbox was sent as `on', and concatenates the results with
+// concatpdfs(). One line per invocation is appended to LOGFILE.
+//
+// Returns the relative path of the concatenated PDF as returned by
+// concatpdfs().
+// NOTE(review): if no file was transferred successfully, concatpdfs() is
+// still called with an empty list -- confirm that this is intended.
+function
+procfiles()
+{
+	$date = date("Y-m-d H:i:s");
+	$ip = $_SERVER['REMOTE_ADDR'];
+	$files = array();
+	foreach ($_FILES as $key => $val) {
+		if ($val['error'] == UPLOAD_ERR_NO_FILE) {
+			continue;
+		}
+		if ($val['error'] > 0) {
+			// The file name is supplied by the client and is
+			// written into an HTML page: escape it.
+			$name = htmlspecialchars($val['name']);
+			echo "Errors in transferring $name. Skipping.\n";
+			echo "($val[error])\n";
+			continue;
+		}
+		if (isset($_POST[$key.'detextify']) && $_POST[$key.'detextify'] == 'on') {
+			$files[] = detextify($val['tmp_name']);
+		} else {
+			$files[] = $val['tmp_name'];
+		}
+	}
+	$newfile = concatpdfs($files);
+	// log
+	$logmsg = sprintf("[%s] %s creates `%s'\n", $date, $ip, $newfile);
+	file_put_contents(LOGFILE, $logmsg, FILE_APPEND);
+
+	return $newfile;
+}
+
+
+// main()
+
+if (isset($_POST['submit'])) {
+	echo '
';
+	$outfile = procfiles();
+	echo '
'; + echo '
'; + echo '

The concatenated PDF

'; + echo '
'; +} + +?> + + +

+ This webservice concatenates PDF files and optionally converts their + text to bitmaps. +

+

+ The files are stored temporarily on the webserver. The detextification + function modifies them. Use this service only if you have the + appropriate rights on the files.

+ +
+

+ + detextify? +
+ + detextify? +
+ + detextify? +
+ + detextify? +
+ + detextify? +

+

+ (Maximum file size: ) +

+

+ +

+
+ + +