changeset 0:8f7e68d54c6d

initial commit: should be already usable
author markus schnalke <meillo@marmaro.de>
date Thu, 29 Aug 2013 13:58:17 +0200 (2013-08-29)
parents
children 6ea97e3f7cb5
files .user.ini README bin/cleanup.cron bin/monthly-stats bin/pdfconcat bin/pdfdetextify pdfconcat.php
diffstat 7 files changed, 228 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/.user.ini	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,2 @@
+upload_max_filesize = 8M
+post_max_size = 8M
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/README	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,53 @@
+pdfconcat
+---------
+
+This program concatenates PDF files and optionally converts text
+within them into bitmaps. The concatenation is done with Ghostscript (gs);
+the detextification is done with a gs-conversion to tiff and tiff2pdf
+(package libtiff-tools) afterwards. Detextification is only in gray-scale.
+
+
+Contents:
+
+- bin/pdfconcat		shell script to concatenate PDF files
+- bin/pdfdetextify	shell script to convert text to images in PDF files
+- pdfconcat.php		web interface to invoke the scripts
+
+- log			log file of web interface invocations (writable)
+- bin/monthly-stats	shell script to sum up the usage by month
+
+- upload/		directory to store the converted files (writable)
+- bin/cleanup.cron	helper script to list old uploaded files for removal
+
+- .user.ini		php config file to increase the max upload file size
+
+
+Installation:
+
+You need a Unix system. Ensure you have gs and tiff2pdf (in the package
+libtiff-tools) available.
+
+To use the bin/pdf* tools on the command line only, copy them to your $PATH
+and make them executable.
+
+To set up the program including the web interface, copy the files to a place
+below the webserver root. Make bin/pdf{concat,detextify} executable for
+www-data. Make the log and the upload directory writable by www-data. Install
+a cronjob to clean up the upload directory. Ensure that the max upload file
+size of PHP is large enough.
+
+
+More or less helpful information sources on the PDF conversion:
+
+	http://stackoverflow.com/questions/6002261/pdf-to-tiff-imagemagick-problem
+	http://www.asmail.be/msg0055376363.html
+	http://kvz.io/blog/2007/11/28/php-tiff2pdf/
+	http://phpdave.wordpress.com/tag/php-pdf-to-tiff/
+
+
+2013, markus schnalke <meillo@marmaro.de>
+Written at KIT-Library, Karlsruhe.
+
+This program is in the public domain. -- Dieses Programm hat nicht die
+notwendige Schoepfungshoehe um urheberrechtlich geschuetzt zu sein. If you
+nevertheless need a license, use it under the CC0 license.
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/cleanup.cron	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,14 @@
+#!/bin/sh
+#
+# print the entries directly inside DIR... that were last accessed more
+# than NUM_OF_DAYS_TO_KEEP days ago;
+# output is meant to be piped into: `xargs rm -f'
+
+[ $# -ge 2 ] || {
+	echo "usage: ${0##*/} NUM_OF_DAYS_TO_KEEP DIR..." >&2
+	exit 1
+}
+keep="$1"
+shift
+
+find "$@" -mindepth 1 -maxdepth 1 -atime +"$keep" -print
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/monthly-stats	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,5 @@
+#!/bin/sh
+#
+# sum up the usage by month
+
+sed 's,^\[\(....-..\).*,\1,' "$@" | uniq -c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/pdfconcat	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,5 @@
+#!/bin/sh
+#
+# concatenate the given PDF files to stdout
+
+gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress -sDEVICE=pdfwrite -sOutputFile=- "$@"
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/bin/pdfdetextify	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,16 @@
+#!/bin/sh
+#
+# convert pdf to tiff and back to pdf in order to convert text to image
+# writes to stdout; exits with status 1 as soon as a conversion fails
+#
+# depends on: gs, tiff2pdf (libtiff-tools)
+
+temp="$(mktemp "/tmp/${0##*/}.XXXXXX")" || exit 1
+trap 'rm -f "$temp"' 0
+trap 'exit 1' 1 2 3 15	# on a signal: stop, leaving through the exit trap
+for i do
+	# render at 300 dpi into an lzw-compressed gray-scale tiff, then
+	# turn that tiff into a pdf; skip tiff2pdf if gs did not succeed
+	gs -q -dNOPAUSE -dBATCH -sPAPERSIZE=a4 -dPDFSETTINGS=/prepress \
+			-r300 -o "$temp" -sDEVICE=tiffgray -sCompression=lzw "$i" &&
+	tiff2pdf -z "$temp" || exit 1
+done
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/pdfconcat.php	Thu Aug 29 13:58:17 2013 +0200
@@ -0,0 +1,133 @@
+<!--
+	pdfconcat-0.1
+	Written by markus schnalke <meillo@marmaro.de>,
+		developed at KIT-Library, Karlsruhe.
+	This is free software under the CC0 license.
+	http://marmaro.de/prog/pdfconcat
+-->
+<html>
+<head>
+<title>PDF concat and detextify</title>
+<meta name="author" content="markus schnalke <meillo@marmaro.de>">
+<meta name="copyright" content="No copyright applies.">
+</head>
+<body>
+<h2>PDF concat and detextify</h2>
+
+<?php
+// helper scripts, run through system() by detextify() and concatpdfs()
+define('PDFDETEXTIFY', dirname(__FILE__).'/bin/pdfdetextify');
+define('PDFCONCAT', dirname(__FILE__).'/bin/pdfconcat');
+
+// log file (absolute path) and upload directory (relative to this file)
+define('LOGFILE', dirname(__FILE__).'/log');
+define('UPLOADDIR', 'upload');
+
+// detextify $file into a fresh temp file (shell words quoted); return its path
+function detextify($file)
+{
+	$newfile = tempnam(sys_get_temp_dir(), basename(__FILE__).".");
+	system(sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFDETEXTIFY),
+			escapeshellarg($file), escapeshellarg($newfile)));
+	return $newfile;
+}
+
+
+// Concatenate $files (shell-quoted) into upload/<timestamp>.pdf, delete the
+// inputs, and return the new file's path relative to this script.
+function concatpdfs($files)
+{
+	$newfile = sprintf("%s/%s/%s.pdf", dirname(__FILE__), UPLOADDIR,
+			date('Y-m-d_H-i-s'));
+	$cmd = sprintf("%s %s 2>&1 >%s", escapeshellarg(PDFCONCAT),
+			implode(' ', array_map('escapeshellarg', $files)),
+			escapeshellarg($newfile));
+	system($cmd);
+	array_map('unlink', $files);
+	return sprintf("%s/%s", UPLOADDIR, basename($newfile));
+}
+
+
+// Process the uploaded files of this request: skip empty and failed
+// upload slots, detextify the files whose "<field>detextify" checkbox
+// was sent as 'on', concatenate everything and append a line to LOGFILE.
+// Prints transfer errors; returns what concatpdfs() returned.
+function procfiles()
+{
+	$date = date("Y-m-d H:i:s");
+	$ip = $_SERVER['REMOTE_ADDR'];
+	$files = array();
+	foreach ($_FILES as $key => $val) {
+		if ($val['error'] == UPLOAD_ERR_NO_FILE) {
+			continue;
+		}
+		if ($val['error'] > 0) {
+			// the file name is client-supplied: escape it for HTML
+			printf("Errors in transferring %s. Skipping.\n",
+					htmlspecialchars($val['name'], ENT_QUOTES));
+			echo "($val[error])\n";
+			continue;
+		}
+		$detext = isset($_POST[$key.'detextify']) &&
+				$_POST[$key.'detextify'] == 'on';
+		$files[] = $detext ? detextify($val['tmp_name']) : $val['tmp_name'];
+	}
+	$newfile = concatpdfs($files);
+	$logmsg = sprintf("[%s] %s creates `%s'\n", $date, $ip, $newfile);
+	file_put_contents(LOGFILE, $logmsg, FILE_APPEND);
+	return $newfile;
+}
+
+
+// main(): handle a submitted form, then continue with the page below
+
+if (isset($_POST['submit'])) {
+	// procfiles() prints its messages as plain text, hence the <pre>
+	echo '<pre>';
+	$outfile = procfiles();
+	echo '</pre>', '<hr>';
+	printf('<h2><a href="%s">The concatenated PDF</a></h2>', $outfile);
+	echo '<hr>';
+}
+
+?>
+
+
+<p>
+	This webservice concatenates PDF files and optionally converts their
+	text to bitmaps.
+</p>
+<p>
+	The files are stored temporarily on the webserver. The detextification
+	function modifies them. Use this service only if you have the
+	appropriate rights on the files.
+</p>
+
+<form action="<?php echo basename($_SERVER['SCRIPT_NAME']); ?>"
+	method="post" enctype="multipart/form-data">
+<p>
+	<input type="file" name="pdf1" />
+	detextify? <input type="checkbox" name="pdf1detextify" />
+	<br />
+	<input type="file" name="pdf2" />
+	detextify? <input type="checkbox" name="pdf2detextify" />
+	<br />
+	<input type="file" name="pdf3" />
+	detextify? <input type="checkbox" name="pdf3detextify" />
+	<br />
+	<input type="file" name="pdf4" />
+	detextify? <input type="checkbox" name="pdf4detextify" />
+	<br />
+	<input type="file" name="pdf5" />
+	detextify? <input type="checkbox" name="pdf5detextify" />
+</p>
+<p>
+	(Maximum file size: <?php echo ini_get('upload_max_filesize'); ?>)
+</p>
+<p>
+	<input type="submit" name="submit" />
+</p>
+</form>
+
+</body>
+</html>