From 51549f3a2701423a7a605465150743c541efe29f Mon Sep 17 00:00:00 2001 From: wangjf2020 Date: Wed, 9 Dec 2020 16:39:19 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=87=AA=E5=AE=9A=E4=B9=89JS?= =?UTF-8?q?ON=E6=A0=BC=E5=BC=8F=E8=BE=93=E5=87=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 3 +- pom.xml | 105 ++++++++++-------- .../technology/tabula/CommandLineApp.java | 40 +++++-- .../technology/tabula/ObjectExtractor.java | 27 +++-- src/main/java/technology/tabula/Page.java | 20 +++- src/main/java/technology/tabula/Table.java | 9 ++ .../java/technology/tabula/TextStripper.java | 8 +- src/main/java/technology/tabula/Utils.java | 83 +++++++++++++- .../java/technology/tabula/debug/Debug.java | 2 +- .../extractors/BasicExtractionAlgorithm.java | 1 + .../SpreadsheetExtractionAlgorithm.java | 8 +- .../tabula/outobjects/OutTable.java | 33 ++++++ .../tabula/writers/SJSONWriter.java | 99 +++++++++++++++++ .../tabula/TestObjectExtractor.java | 20 ++-- .../technology/tabula/TestTableDetection.java | 2 +- .../technology/tabula/UtilsForTesting.java | 2 +- 16 files changed, 380 insertions(+), 82 deletions(-) create mode 100644 src/main/java/technology/tabula/outobjects/OutTable.java create mode 100644 src/main/java/technology/tabula/writers/SJSONWriter.java diff --git a/README.md b/README.md index 2a08d3ac..58c6e4a5 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Tabula helps you extract tables from PDFs between 0-100 (inclusive) and preceded by '%', input will be taken as % of actual width of the page. Example: --columns %25,50,80.6 - -f,--format Output format: (CSV,TSV,JSON). Default: CSV + -f,--format Output format: (CSV,TSV,JSON,SJSON). Default: CSV -g,--guess Guess the portion of the page to analyze per page. -h,--help Print this help text. @@ -67,6 +67,7 @@ Tabula helps you extract tables from PDFs -u,--use-line-returns Use embedded line returns in cells. (Only in spreadsheet mode.) -v,--version Print version and exit. + -tn, --tableNames 筛选要输出的表 ``` It also includes a debugging tool, run `java -cp ./target/tabula-1.0.2-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options. diff --git a/pom.xml b/pom.xml index b88e7f71..d89d54e1 100644 --- a/pom.xml +++ b/pom.xml @@ -124,26 +124,27 @@ - - org.apache.maven.plugins - maven-gpg-plugin - 1.6 - - - sign-artifacts - verify - - sign - - - - --pinentry-mode - loopback - - - - - + + + + + + + + + + + + + + + + + + + + + maven-compiler-plugin 3.8.1 @@ -159,33 +160,43 @@ technology.tabula.CommandLineApp - - - jar-with-dependencies - + + + jar-with-dependencies + + + + + make-assembly + package + + single + + + + + + org.apache.maven.plugins + maven-surefire-plugin + 2.22.2 + + + -Xms1024m -Xmx2048m + true - - - org.apache.maven.plugins - maven-surefire-plugin - 2.22.2 - - - -Xms1024m -Xmx2048m - - - - - org.apache.maven.plugins - maven-eclipse-plugin - 2.10 - - true - true - - - - + + + + org.apache.maven.plugins + maven-eclipse-plugin + 2.10 + + true + true + + + + @@ -221,6 +232,7 @@ + diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java index 0228df4b..7922267d 100644 --- a/src/main/java/technology/tabula/CommandLineApp.java +++ b/src/main/java/technology/tabula/CommandLineApp.java @@ -7,7 +7,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.List; +import java.util.Map; +import javafx.scene.control.Tab; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.HelpFormatter; @@ -17,14 +19,12 @@ import org.apache.commons.cli.DefaultParser; import org.apache.pdfbox.pdmodel.PDDocument; +import org.locationtech.jts.util.StringUtil; import technology.tabula.detectors.DetectionAlgorithm; import technology.tabula.detectors.NurminenDetectionAlgorithm; import technology.tabula.extractors.BasicExtractionAlgorithm; import technology.tabula.extractors.SpreadsheetExtractionAlgorithm; -import technology.tabula.writers.CSVWriter; -import technology.tabula.writers.JSONWriter; -import technology.tabula.writers.TSVWriter; -import technology.tabula.writers.Writer; +import technology.tabula.writers.*; public class CommandLineApp { @@ -44,6 +44,8 @@ public class CommandLineApp { private OutputFormat outputFormat; private String password; private TableExtractor tableExtractor; + private Map> tableMap; + public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException { this.defaultOutput = defaultOutput; @@ -51,6 +53,7 @@ public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseEx this.pages = CommandLineApp.whichPages(line); this.outputFormat = CommandLineApp.whichOutputFormat(line); this.tableExtractor = CommandLineApp.createExtractor(line); + this.tableMap = CommandLineApp.whichTableMap(line); if (line.hasOption('s')) { this.password = line.getOptionValue('s'); @@ -160,6 +163,10 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException while (pageIterator.hasNext()) { Page page = pageIterator.next(); + if (page == null) { + continue; + } + if (tableExtractor.verticalRulingPositions != null) { for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) { page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight())); @@ -195,7 +202,8 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException } private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException { - ObjectExtractor extractor = new ObjectExtractor(pdfDocument); + ListtableNames=new ArrayList(tableMap.keySet()); + ObjectExtractor extractor = new ObjectExtractor(pdfDocument, tableNames); return (pages == null) ? extractor.extract() : extractor.extract(pages); @@ -244,9 +252,17 @@ private static List> whichAreas(CommandLine line) throw private static List whichPages(CommandLine line) throws ParseException { String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1"; + String tableName = line.hasOption("tn") ? line.getOptionValue("tn") : ""; + if (!"".equals(tableName) && "1".equals(pagesOption)) + pagesOption = "all"; return Utils.parsePagesOption(pagesOption); } + private static Map> whichTableMap(CommandLine line) throws ParseException{ + String pagesOption = line.hasOption("tn") ? line.getOptionValue("tn") : ""; + return Utils.parseTableMapOption(pagesOption); + } + private static ExtractionMethod whichExtractionMethod(CommandLine line) { // -r/--spreadsheet [deprecated; use -l] or -l/--lattice if (line.hasOption('r') || line.hasOption('l')) { @@ -358,7 +374,12 @@ public static Options buildOptions() { .hasArg() .argName("PAGES") .build()); - + o.addOption(Option.builder("tn") + .longOpt("tableNames") + .desc("Comma separated list of TableName, or all. Examples: --tableName table1,table2") + .hasArg() + .argName("TABLENAMES") + .build()); return o; } @@ -462,6 +483,9 @@ private void writeTables(List tables, Appendable out) throws IOException case JSON: writer = new JSONWriter(); break; + case SJSON: + writer = new SJSONWriter(tableMap); + break; case TSV: writer = new TSVWriter(); break; @@ -476,6 +500,7 @@ private String getOutputFilename(File pdfFile) { extension = ".csv"; break; case JSON: + case SJSON: extension = ".json"; break; case TSV: @@ -488,7 +513,8 @@ private String getOutputFilename(File pdfFile) { private enum OutputFormat { CSV, TSV, - JSON; + JSON, + SJSON; static String[] formatNames() { OutputFormat[] values = OutputFormat.values(); diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java index 3998ba6f..5e35dca5 100644 --- a/src/main/java/technology/tabula/ObjectExtractor.java +++ b/src/main/java/technology/tabula/ObjectExtractor.java @@ -1,6 +1,7 @@ package technology.tabula; import java.io.IOException; +import java.util.List; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -9,8 +10,11 @@ public class ObjectExtractor { private final PDDocument pdfDocument; - public ObjectExtractor(PDDocument pdfDocument) { + private final List tableNames; + + public ObjectExtractor(PDDocument pdfDocument, List tableNames) { this.pdfDocument = pdfDocument; + this.tableNames = tableNames; } protected Page extractPage(Integer pageNumber) throws IOException { @@ -21,17 +25,22 @@ protected Page extractPage(Integer pageNumber) throws IOException { } PDPage p = this.pdfDocument.getPage(pageNumber - 1); - - ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p); - se.processPage(p); - - TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber); - pdfTextStripper.process(); - + String tableName = ""; + //TODO 判断表名是否存在 + if (tableNames != null){ + //采用文本包含方式判断表名,后续需优化 + tableName = Utils.findTableName(tableNames, pdfTextStripper.getContent()); + if ("".equals(tableName)) { + return null; + } + } Utils.sort(pdfTextStripper.textElements, Rectangle.ILL_DEFINED_ORDER); + ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p); + se.processPage(p); + float w, h; int pageRotation = p.getRotation(); if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) { @@ -43,7 +52,7 @@ protected Page extractPage(Integer pageNumber) throws IOException { } return new Page(0, 0, w, h, pageRotation, pageNumber, p, this.pdfDocument, pdfTextStripper.textElements, - se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex); + se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex, pdfTextStripper.getContent(), tableName); } public PageIterator extract(Iterable pages) { diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java index 3207bcb9..76a988e5 100644 --- a/src/main/java/technology/tabula/Page.java +++ b/src/main/java/technology/tabula/Page.java @@ -16,6 +16,8 @@ public class Page extends Rectangle { private Integer rotation; private int pageNumber; private List texts; + private String content; + private String tableName; private List rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null; private float minCharWidth; private float minCharHeight; @@ -39,7 +41,6 @@ public Page(float top, float left, float width, float height, int rotation, int this.rulings = rulings; } - public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc, List characters, List rulings, float minCharWidth, float minCharHeight, RectangleSpatialIndex index) { @@ -50,6 +51,15 @@ public Page(float top, float left, float width, float height, int rotation, int this.spatial_index = index; } + public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc, + List characters, List rulings, + float minCharWidth, float minCharHeight, RectangleSpatialIndex index, String content, String tableName) { + + this(top, left, width, height, rotation, page_number, pdPage, doc, characters, rulings,minCharHeight, minCharWidth, index); + this.content = content; + this.tableName = tableName; + } + public Page getArea(Rectangle area) { List t = getText(area); float min_char_width = 7; @@ -236,6 +246,14 @@ public PDDocument getPDDoc() { return pdDoc; } + public String getContent() { + return content; + } + + public String getTableName() { + return tableName; + } + /** @deprecated with no replacement */ @Deprecated public RectangleSpatialIndex getSpatialIndex() { return this.spatial_index; diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java index c031c9ed..529ca35a 100644 --- a/src/main/java/technology/tabula/Table.java +++ b/src/main/java/technology/tabula/Table.java @@ -23,12 +23,21 @@ public Table(ExtractionAlgorithm extractionAlgorithm) { private int rowCount = 0; private int colCount = 0; + private String tableName; /* visible for testing */ final TreeMap cells = new TreeMap<>(); public int getRowCount() { return rowCount; } public int getColCount() { return colCount; } + public String getTableName() { + return tableName; + } + + public void setTableName(String tableName) { + this.tableName = tableName; + } + public String getExtractionMethod() { return extractionMethod; } public void add(RectangularTextContainer chunk, int row, int col) { diff --git a/src/main/java/technology/tabula/TextStripper.java b/src/main/java/technology/tabula/TextStripper.java index 329d45a2..fb526ea6 100644 --- a/src/main/java/technology/tabula/TextStripper.java +++ b/src/main/java/technology/tabula/TextStripper.java @@ -25,6 +25,11 @@ public class TextStripper extends PDFTextStripper { public float minCharHeight = Float.MAX_VALUE; public float totalHeight = 0.0f; public int countHeight = 0; + private String content; + + public String getContent() { + return content; + } public TextStripper(PDDocument document, int pageNumber) throws IOException { super(); @@ -36,12 +41,13 @@ public TextStripper(PDDocument document, int pageNumber) throws IOException { } public void process() throws IOException { - this.getText(this.document); + content = this.getText(this.document); } @Override protected void writeString(String string, List textPositions) throws IOException { + super.writeString(string, textPositions); for (TextPosition textPosition: textPositions) { if (textPosition == null) { diff --git a/src/main/java/technology/tabula/Utils.java b/src/main/java/technology/tabula/Utils.java index 00814429..66b5ef00 100644 --- a/src/main/java/technology/tabula/Utils.java +++ b/src/main/java/technology/tabula/Utils.java @@ -8,7 +8,10 @@ import java.io.IOException; import java.math.BigDecimal; import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import javafx.scene.control.Tab; import org.apache.commons.cli.ParseException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDPage; @@ -58,7 +61,6 @@ public static Rectangle bounds(Collection shapes) { } return rv; - } // range iterator @@ -117,6 +119,28 @@ public static List> transpose(List> table) { return ret; } + public static Table maxColTable(List tables){ + int colCount = 0; + Table table = null; + for (int i = 0; i< tables.size();i++) { + Table t = tables.get(i); + if (t.getColCount() > colCount || i == 0){ + colCount = t.getColCount(); + table = t; + } + } + return table; + } + + public static boolean isEmptyRow(List rows){ + for(String item: rows){ + if (item != null && !"".equals(item)){ + return false; + } + } + return true; + } + /** * Wrap Collections.sort so we can fallback to a non-stable quicksort if we're * running on JDK7+ @@ -187,6 +211,63 @@ public static List parsePagesOption(String pagesSpec) throws ParseExcep return rv; } + public static Map> parseTableMapOption(String tableNamesSpec) throws ParseException { + if (tableNamesSpec.equals("")) { + return null; + } + Map> rv = new HashMap<>(); + String[] ranges = tableNamesSpec.split(","); + for (int i = 0; i < ranges.length; i++) { + List cols = new ArrayList<>(); + //解析表名和列 + String[] tns = ranges[i].split("\\["); + if ("".equals(tns[0])) + continue; + String tableName = tns[0]; + rv.put(tableName,cols); + List colGroup= findContentByRegex(ranges[i], "\\\\[(.*?)]"); + for(String str: colGroup){ + String[] cns = str.split("|"); + for(String item: cns) cols.add(item); + } + } + return rv; + } + + public static List findContentByRegex(String content, String regex){ + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(content); + List lstStr = new ArrayList<>(); + while (matcher.find()) { + lstStr.add(matcher.group(1)); + } + return lstStr; + } + + public static boolean isMatch(String content, String regex){ + Pattern pattern = Pattern.compile(regex); + Matcher matcher = pattern.matcher(content); + return matcher.matches(); + } + + public static boolean containTable(List tableNames, String content){ + for (String tableName: tableNames) { + if (content.contains(tableName)){ + return true; + } + } + return false; + } + + public static String findTableName(List tableNames, String content){ + for (String tableName: tableNames) { + if (content.contains(tableName)){ + return tableName; + } + } + return ""; + } + public static void snapPoints(List rulings, float xThreshold, float yThreshold) { // collect points and keep a Line -> p1,p2 map diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java index 91609045..0464515d 100644 --- a/src/main/java/technology/tabula/debug/Debug.java +++ b/src/main/java/technology/tabula/debug/Debug.java @@ -217,7 +217,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re boolean drawDetectedTables) throws IOException { PDDocument document = PDDocument.load(new File(pdfPath)); - ObjectExtractor oe = new ObjectExtractor(document); + ObjectExtractor oe = new ObjectExtractor(document, null); Page page = oe.extract(pageNumber + 1); diff --git a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java index dcd01695..aa4c1c7d 100644 --- a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java +++ b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java @@ -65,6 +65,7 @@ public int compare(Ruling arg0, Ruling arg1) { Table table = new Table(this); table.setRect(page.getLeft(), page.getTop(), page.getWidth(), page.getHeight()); + table.setTableName(page.getTableName()); for (int i = 0; i < lines.size(); i++) { Line line = lines.get(i); diff --git a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java index c377507c..d6a3087e 100644 --- a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java +++ b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java @@ -132,6 +132,7 @@ else if (r.vertical()) { } TableWithRulingLines t = new TableWithRulingLines(area, overlappingCells, horizontalOverlappingRulings, verticalOverlappingRulings, this); + t.setTableName(page.getTableName()); spreadsheets.add(t); } Utils.sort(spreadsheets, Rectangle.ILL_DEFINED_ORDER); @@ -154,7 +155,8 @@ public boolean isTabular(Page page) { if (tables.size() == 0) { return false; } - Table table = tables.get(0); + //Table table = tables.get(0); + Table table = Utils.maxColTable(tables); int rowsDefinedByLines = table.getRowCount(); int colsDefinedByLines = table.getColCount(); @@ -167,8 +169,8 @@ public boolean isTabular(Page page) { int colsDefinedWithoutLines = table.getColCount(); float ratio = (((float) colsDefinedByLines / colsDefinedWithoutLines) + ((float) rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f; - - return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1/MAGIC_HEURISTIC_NUMBER); + //&& ratio < (1/MAGIC_HEURISTIC_NUMBER) + return ratio > MAGIC_HEURISTIC_NUMBER ; } public static List findCells(List horizontalRulingLines, List verticalRulingLines) { diff --git a/src/main/java/technology/tabula/outobjects/OutTable.java b/src/main/java/technology/tabula/outobjects/OutTable.java new file mode 100644 index 00000000..cd2fd42a --- /dev/null +++ b/src/main/java/technology/tabula/outobjects/OutTable.java @@ -0,0 +1,33 @@ +package technology.tabula.outobjects; + +import java.util.List; + +public class OutTable { + private String name; + private List column; + private List> data; + + public String getName() { + return name; + } + + public void setName(String name) { + this.name = name; + } + + public List getColumn() { + return column; + } + + public void setColumn(List column) { + this.column = column; + } + + public List> getData() { + return data; + } + + public void setData(List> data) { + this.data = data; + } +} diff --git a/src/main/java/technology/tabula/writers/SJSONWriter.java b/src/main/java/technology/tabula/writers/SJSONWriter.java new file mode 100644 index 00000000..72e89989 --- /dev/null +++ b/src/main/java/technology/tabula/writers/SJSONWriter.java @@ -0,0 +1,99 @@ +package technology.tabula.writers; + +import com.google.gson.*; +import technology.tabula.*; +import technology.tabula.json.RectangularTextContainerSerializer; +import technology.tabula.json.TableSerializer; +import technology.tabula.outobjects.OutTable; + +import java.io.IOException; +import java.lang.reflect.Modifier; +import java.util.*; + +public class SJSONWriter implements Writer { + + private Map> tableMap; + private static final ExclusionStrategy ALLCLASSES_SKIPNONPUBLIC = new ExclusionStrategy() { + @Override public boolean shouldSkipClass(Class c) { return false; } + @Override public boolean shouldSkipField(FieldAttributes fa) { return !fa.hasModifier(Modifier.PUBLIC); } + }; + + public SJSONWriter(Map> tableMap){ + this.tableMap = tableMap; + } + + @Override + public void write(Appendable out, Table table) throws IOException { + write(out, Collections.singletonList(table)); + } + + @Override + public void write(Appendable out, List
tables) throws IOException { + Gson gson = gson(); + Map outTableMap = new HashMap<>(); + OutTable outTable = null; + for (Table table : tables) { + if (table.getRowCount() > 0){ + String tableName = table.getTableName(); + if (outTableMap.containsKey(tableName)) { + outTable = outTableMap.get(tableName); + } + else { + outTable = new OutTable(); + outTable.setName(tableName); + outTable.setColumn(new ArrayList<>()); + outTable.setData(new ArrayList<>()); + outTableMap.put(tableName, outTable); + } + int dataRow = 0; + //查找列的位置及数据开始位置 + /* + Map colPos = null; + if (tableMap != null) { + List cols = tableMap.get(tableName); + for(String item: cols){ + colPos.put(item, null); + } + } + */ + + /* + if (colPos != null) { + for (int i = 0; i < table.getRows().size(); i++) { + List row = table.getRows().get(i); + for (int j = 0; j< row.size(); j++) { + RectangularTextContainer tc = row.get(j); + for (String key : colPos.keySet()) { + if (colPos.get(key) != null && Utils.isMatch(tc.getText(),key)){ + outTable.getColumn().add(tc.getText()); + colPos.put(key, Integer.valueOf(j)); + dataRow = i; + } + } + } + } + } + */ + for(int i = dataRow; i< table.getRows().size(); i++){ + List row = table.getRows().get(i); + List cells = new ArrayList<>(row.size()); + for (RectangularTextContainer tc : row) { + cells.add(tc.getText()); + } + if (!Utils.isEmptyRow(cells)) outTable.getData().add(cells); + } + } + } + JsonArray array = new JsonArray(); + for (Map.Entry m : outTableMap.entrySet()) { + array.add(gson.toJsonTree(m.getValue(), OutTable.class)); + } + out.append(gson.toJson(array)); + } + + private static Gson gson() { + return new GsonBuilder() + .create(); + } + +} diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java index fe458b87..dda1ffdd 100644 --- a/src/test/java/technology/tabula/TestObjectExtractor.java +++ b/src/test/java/technology/tabula/TestObjectExtractor.java @@ -22,14 +22,14 @@ public void testWrongPasswordRaisesException() throws IOException { @Test(expected = IOException.class) public void testEmptyOnEncryptedFileRaisesException() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); oe.extract().next(); } @Test public void testCanReadPDFWithOwnerEncryption() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document,null); PageIterator pi = oe.extract(); int i = 0; while (pi.hasNext()) { @@ -43,7 +43,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException { @Test public void testGoodPassword() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword"); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); List pages = new ArrayList<>(); PageIterator pi = oe.extract(); while (pi.hasNext()) { @@ -56,7 +56,7 @@ public void testGoodPassword() throws IOException { @Test public void testTextExtractionDoesNotRaise() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); PageIterator pi = oe.extract(); assertTrue(pi.hasNext()); @@ -68,7 +68,7 @@ public void testTextExtractionDoesNotRaise() throws IOException { @Test public void testShouldDetectRulings() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); PageIterator pi = oe.extract(); Page page = pi.next(); @@ -82,7 +82,7 @@ public void testShouldDetectRulings() throws IOException { @Test public void testDontThrowNPEInShfill() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); PageIterator pi = oe.extract(); assertTrue(pi.hasNext()); try { @@ -98,7 +98,7 @@ public void testExtractOnePage() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); Page page = oe.extract(2); assertNotNull(page); @@ -110,7 +110,7 @@ public void testExtractWrongPageNumber() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf")); assertEquals(2, pdf_document.getNumberOfPages()); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); oe.extract(3); } @@ -118,7 +118,7 @@ public void testExtractWrongPageNumber() throws IOException { @Test public void testTextElementsContainedInPage() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); Page page = oe.extractPage(1); @@ -129,7 +129,7 @@ public void testTextElementsContainedInPage() throws IOException { @Test public void testDoNotNPEInPointComparator() throws IOException { PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf")); - ObjectExtractor oe = new ObjectExtractor(pdf_document); + ObjectExtractor oe = new ObjectExtractor(pdf_document, null); try { Page p = oe.extractPage(1); diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java index 6e58f6a4..3efdf2d8 100644 --- a/src/test/java/technology/tabula/TestTableDetection.java +++ b/src/test/java/technology/tabula/TestTableDetection.java @@ -163,7 +163,7 @@ public void testDetectionOfTables() throws Exception { // tabula extractors PDDocument pdfDocument = PDDocument.load(this.pdf); - ObjectExtractor extractor = new ObjectExtractor(pdfDocument); + ObjectExtractor extractor = new ObjectExtractor(pdfDocument, null); // parse expected tables from the ground truth dataset Map> expectedTables = new HashMap<>(); diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java index 3ee8efde..6e82b4b5 100644 --- a/src/test/java/technology/tabula/UtilsForTesting.java +++ b/src/test/java/technology/tabula/UtilsForTesting.java @@ -25,7 +25,7 @@ public static Page getPage(String path, int pageNumber) throws IOException { try { PDDocument document = PDDocument .load(new File(path)); - oe = new ObjectExtractor(document); + oe = new ObjectExtractor(document, null); Page page = oe.extract(pageNumber); return page; } finally {