diff --git a/tika-app-1.0.jar b/1.1/tika-app-1.1.jar similarity index 86% rename from tika-app-1.0.jar rename to 1.1/tika-app-1.1.jar index 482737c..0682948 100644 Binary files a/tika-app-1.0.jar and b/1.1/tika-app-1.1.jar differ diff --git a/tika-core-1.0.jar b/1.1/tika-core-1.1.jar similarity index 61% rename from tika-core-1.0.jar rename to 1.1/tika-core-1.1.jar index 7e76da2..7ad2be6 100644 Binary files a/tika-core-1.0.jar and b/1.1/tika-core-1.1.jar differ diff --git a/tika-parsers-1.0.jar b/1.1/tika-parsers-1.1.jar similarity index 53% rename from tika-parsers-1.0.jar rename to 1.1/tika-parsers-1.1.jar index 70d46ac..ad82942 100644 Binary files a/tika-parsers-1.0.jar and b/1.1/tika-parsers-1.1.jar differ diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..adf54e7 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,2 @@ +Chris Wilson (http://github.com/qris) +Sudharshan S (http://github.com/sudharsh) \ No newline at end of file diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..a18b89d --- /dev/null +++ b/ChangeLog @@ -0,0 +1,11 @@ +2012-07-14 Sudharshan S + + * v1.1: Bump up tika version to 1.1 + Reorganize directories. + * setup.py: StringBufferInputStream with ByteArrayInputStream. + StringBufferInputStream is deprecated and didn't work with raw binary strings. + * parser: Add a parser module with from_file and from_buffer functions + + + + diff --git a/README.md b/README.md new file mode 100644 index 0000000..b560e3b --- /dev/null +++ b/README.md @@ -0,0 +1,38 @@ +# python-tika - Python bindings for Apache Tika + +## Requirements + +* Java >= 1.5 +* [JCC](http://lucene.apache.org/pylucene/jcc/index.html) + +## Installation + $ python setup.py build + $ python setup.py install + +## Usage + +To use the `AutoDetectParser`, + + import tika + tika.initVM() + + from tika import parser + + print parser.from_buffer("Hello World") + # Or directly from a file, + # print parser.from_file("/tmp/foo.doc") + +returns a `dict`, + + {'content': u'Hello Cruel World', + 'metadata': {u'Content-Encoding': u'ISO-8859-1', + u'Content-Type': u'text/html', + u'title': u'Hello world'} + } + + + + + + + diff --git a/log4j.properties b/lib/log4j.properties similarity index 100% rename from log4j.properties rename to lib/log4j.properties diff --git a/log4j.properties.jar b/lib/log4j.properties.jar similarity index 100% rename from log4j.properties.jar rename to lib/log4j.properties.jar diff --git a/org.eclipse.osgi.jar b/lib/org.eclipse.osgi.jar similarity index 100% rename from org.eclipse.osgi.jar rename to lib/org.eclipse.osgi.jar diff --git a/parser/__init__.py b/parser/__init__.py new file mode 100644 index 0000000..d8d59e2 --- /dev/null +++ b/parser/__init__.py @@ -0,0 +1,30 @@ +import tika + +def from_file(filename): + """Parse filename using tika's AutoDetectParser.""" + stream = tika.FileInputStream(tika.File(filename)) + return __parse(stream) + +def from_buffer(string): + """Parse raw binary string using tika's AutoDetectParser.""" + stream = tika.ByteArrayInputStream(tika.JArray_byte(string)) + return __parse(stream) + +def __parse(stream): + parsed = {} + parser = tika.AutoDetectParser() + content = tika.BodyContentHandler() + metadata = tika.Metadata() + context = tika.ParseContext() + parser.parse(stream, content, metadata, context) + parsed["content"] = content.toString() + parsed["metadata"] = {} + for n in metadata.names(): + parsed["metadata"][n] = metadata.get(n) + return parsed + + + + + + diff --git a/setup.py b/setup.py index afd1f5d..93d61e0 100644 --- a/setup.py +++ b/setup.py @@ -3,15 +3,15 @@ from jcc import cpp options = { - 'include': ('org.eclipse.osgi.jar', 'tika-app-1.0.jar', - 'log4j.properties.jar'), - 'jar': ('tika-parsers-1.0.jar', 'tika-core-1.0.jar',), + 'include': ('lib/org.eclipse.osgi.jar', '1.1/tika-app-1.1.jar', + 'lib/log4j.properties.jar'), + 'jar': ('1.1/tika-parsers-1.1.jar', '1.1/tika-core-1.1.jar',), 'package': ('org.xml.sax',), 'python': 'tika', - 'version': '1.0', + 'version': '1.1', + 'module': 'parser', 'reserved': ('asm',), - 'classes': ('java.io.File', 'java.io.FileInputStream', - 'java.io.StringBufferInputStream'), + 'classes': ('java.io.File', 'java.io.FileInputStream', 'java.io.ByteArrayInputStream'), } import sys diff --git a/tika-icon-large.xcf b/tika-icon-large.xcf deleted file mode 100644 index 1a92734..0000000 Binary files a/tika-icon-large.xcf and /dev/null differ diff --git a/tika-icons.ico b/tika-icons.ico deleted file mode 100644 index a538e01..0000000 Binary files a/tika-icons.ico and /dev/null differ diff --git a/tika-icons.xcf b/tika-icons.xcf deleted file mode 100644 index 1d538d4..0000000 Binary files a/tika-icons.xcf and /dev/null differ diff --git a/tika-logo.png b/tika-logo.png deleted file mode 100644 index d3487c5..0000000 Binary files a/tika-logo.png and /dev/null differ diff --git a/tika-server-1.1-SNAPSHOT.jar b/tika-server-1.1-SNAPSHOT.jar deleted file mode 100644 index 4b1ad35..0000000 Binary files a/tika-server-1.1-SNAPSHOT.jar and /dev/null differ diff --git a/Tika-WinRun4J.exe b/win/Tika-WinRun4J.exe similarity index 100% rename from Tika-WinRun4J.exe rename to win/Tika-WinRun4J.exe diff --git a/Tika-WinRun4J.ini b/win/Tika-WinRun4J.ini similarity index 100% rename from Tika-WinRun4J.ini rename to win/Tika-WinRun4J.ini diff --git a/Tika-WinRun4J.jar b/win/Tika-WinRun4J.jar similarity index 100% rename from Tika-WinRun4J.jar rename to win/Tika-WinRun4J.jar diff --git a/TikaServiceWinRun4J.jar b/win/TikaServiceWinRun4J.jar similarity index 100% rename from TikaServiceWinRun4J.jar rename to win/TikaServiceWinRun4J.jar diff --git a/TikaServiceWinRun4J.java b/win/TikaServiceWinRun4J.java similarity index 100% rename from TikaServiceWinRun4J.java rename to win/TikaServiceWinRun4J.java diff --git a/WinRun4J.jar b/win/WinRun4J.jar similarity index 100% rename from WinRun4J.jar rename to win/WinRun4J.jar diff --git a/winrun4J-0.4.4.zip b/win/winrun4J-0.4.4.zip similarity index 100% rename from winrun4J-0.4.4.zip rename to win/winrun4J-0.4.4.zip