Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file renamed tika-app-1.0.jar → 1.1/tika-app-1.1.jar
Binary file not shown.
Binary file renamed tika-core-1.0.jar → 1.1/tika-core-1.1.jar
Binary file not shown.
Binary file renamed tika-parsers-1.0.jar → 1.1/tika-parsers-1.1.jar
Binary file not shown.
2 changes: 2 additions & 0 deletions AUTHORS
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
Chris Wilson (http://github.com/qris)
Sudharshan S (http://github.com/sudharsh)
11 changes: 11 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
2012-07-14 Sudharshan S <sudharsh@gmail.com>

* v1.1: Bump up tika version to 1.1
Reorganize directories.
* setup.py: Replace StringBufferInputStream with ByteArrayInputStream.
StringBufferInputStream is deprecated and didn't work with raw binary strings.
* parser: Add a parser module with from_file and from_buffer functions




38 changes: 38 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# python-tika - Python bindings for Apache Tika

## Requirements

* Java >= 1.5
* [JCC](http://lucene.apache.org/pylucene/jcc/index.html)

## Installation
$ python setup.py build
$ python setup.py install

## Usage

To use the `AutoDetectParser`,

import tika
tika.initVM()

from tika import parser

print parser.from_buffer("<html><head><title>Hello world</title></head><body>Hello Cruel World</body></html>")
# Or directly from a file,
# print parser.from_file("/tmp/foo.doc")

returns a `dict`,

{'content': u'Hello Cruel World',
'metadata': {u'Content-Encoding': u'ISO-8859-1',
u'Content-Type': u'text/html',
u'title': u'Hello world'}
}







File renamed without changes.
File renamed without changes.
File renamed without changes.
30 changes: 30 additions & 0 deletions parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import tika

def from_file(filename):
    """Parse the file at `filename` using tika's AutoDetectParser.

    A Java ``FileInputStream`` is opened on the file, handed to the
    module's shared parse helper, and always closed afterwards.

    Returns whatever the parse helper returns: a dict with a "content"
    entry and a "metadata" entry.
    """
    stream = tika.FileInputStream(tika.File(filename))
    try:
        return __parse(stream)
    finally:
        # Tika's Parser.parse() consumes the stream but leaves closing it to
        # the caller; without this the file handle stays open until the JVM
        # finalizes the stream object.
        stream.close()

def from_buffer(string):
    """Run tika's AutoDetectParser over an in-memory raw binary string.

    The string is converted to a Java byte array, wrapped in a
    ``ByteArrayInputStream``, and passed to the module's shared parse
    helper; that helper's result is returned unchanged.
    """
    raw_bytes = tika.JArray_byte(string)
    return __parse(tika.ByteArrayInputStream(raw_bytes))

def __parse(stream):
    """Feed `stream` to an AutoDetectParser and collect the results.

    Returns a dict with two entries:
      "content"  -- the string form of the BodyContentHandler after parsing
      "metadata" -- a dict mapping each name reported by the Metadata
                    object to the value ``Metadata.get`` returns for it
    """
    detector = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    meta = tika.Metadata()
    detector.parse(stream, handler, meta, tika.ParseContext())

    # Read the extracted text before walking the metadata names.
    text = handler.toString()
    return {
        "content": text,
        "metadata": dict((name, meta.get(name)) for name in meta.names()),
    }






12 changes: 6 additions & 6 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,15 @@
from jcc import cpp

options = {
'include': ('org.eclipse.osgi.jar', 'tika-app-1.0.jar',
'log4j.properties.jar'),
'jar': ('tika-parsers-1.0.jar', 'tika-core-1.0.jar',),
'include': ('lib/org.eclipse.osgi.jar', '1.1/tika-app-1.1.jar',
'lib/log4j.properties.jar'),
'jar': ('1.1/tika-parsers-1.1.jar', '1.1/tika-core-1.1.jar',),
'package': ('org.xml.sax',),
'python': 'tika',
'version': '1.0',
'version': '1.1',
'module': 'parser',
'reserved': ('asm',),
'classes': ('java.io.File', 'java.io.FileInputStream',
'java.io.StringBufferInputStream'),
'classes': ('java.io.File', 'java.io.FileInputStream', 'java.io.ByteArrayInputStream'),
}

import sys
Expand Down
Binary file removed tika-icon-large.xcf
Binary file not shown.
Binary file removed tika-icons.ico
Binary file not shown.
Binary file removed tika-icons.xcf
Binary file not shown.
Binary file removed tika-logo.png
Binary file not shown.
Binary file removed tika-server-1.1-SNAPSHOT.jar
Binary file not shown.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.