From 51549f3a2701423a7a605465150743c541efe29f Mon Sep 17 00:00:00 2001
From: wangjf2020 <xijuewang@163.com>
Date: Wed, 9 Dec 2020 16:39:19 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0=E8=87=AA=E5=AE=9A=E4=B9=89JS?=
 =?UTF-8?q?ON=E6=A0=BC=E5=BC=8F=E8=BE=93=E5=87=BA?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |   3 +-
 pom.xml                                       | 105 ++++++++++--------
 .../technology/tabula/CommandLineApp.java     |  40 +++++--
 .../technology/tabula/ObjectExtractor.java    |  27 +++--
 src/main/java/technology/tabula/Page.java     |  20 +++-
 src/main/java/technology/tabula/Table.java    |   9 ++
 .../java/technology/tabula/TextStripper.java  |   8 +-
 src/main/java/technology/tabula/Utils.java    |  83 +++++++++++++-
 .../java/technology/tabula/debug/Debug.java   |   2 +-
 .../extractors/BasicExtractionAlgorithm.java  |   1 +
 .../SpreadsheetExtractionAlgorithm.java       |   8 +-
 .../tabula/outobjects/OutTable.java           |  33 ++++++
 .../tabula/writers/SJSONWriter.java           |  99 +++++++++++++++++
 .../tabula/TestObjectExtractor.java           |  20 ++--
 .../technology/tabula/TestTableDetection.java |   2 +-
 .../technology/tabula/UtilsForTesting.java    |   2 +-
 16 files changed, 380 insertions(+), 82 deletions(-)
 create mode 100644 src/main/java/technology/tabula/outobjects/OutTable.java
 create mode 100644 src/main/java/technology/tabula/writers/SJSONWriter.java
diff --git a/README.md b/README.md
index 2a08d3ac..58c6e4a5 100644
--- a/README.md
+++ b/README.md
@@ -37,7 +37,7 @@ Tabula helps you extract tables from PDFs
                             between 0-100 (inclusive) and preceded by '%',
                             input will be taken as % of actual width of
                             the page. Example: --columns %25,50,80.6
- -f,--format <FORMAT>       Output format: (CSV,TSV,JSON). Default: CSV
+ -f,--format <FORMAT>       Output format: (CSV,TSV,JSON,SJSON). Default: CSV
  -g,--guess                 Guess the portion of the page to analyze per
                             page.
  -h,--help                  Print this help text.
@@ -67,6 +67,7 @@ Tabula helps you extract tables from PDFs
  -u,--use-line-returns      Use embedded line returns in cells. (Only in
                             spreadsheet mode.)
  -v,--version               Print version and exit.
+ -tn,--tableNames <TABLENAMES>  Comma separated list of table names used to filter which tables are output.
 ```
 
 It also includes a debugging tool, run `java -cp ./target/tabula-1.0.2-jar-with-dependencies.jar technology.tabula.debug.Debug -h` for the available options.
diff --git a/pom.xml b/pom.xml
index b88e7f71..d89d54e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -124,26 +124,27 @@
                 </execution>
               </executions>
             </plugin>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-gpg-plugin</artifactId>
-                <version>1.6</version>
-                <executions>
-                    <execution>
-                        <id>sign-artifacts</id>
-                        <phase>verify</phase>
-                        <goals>
-                            <goal>sign</goal>
-                        </goals>
-                        <configuration>
-                            <gpgArguments>
-                                <arg>--pinentry-mode</arg>
-                                <arg>loopback</arg>
-                            </gpgArguments>
-                        </configuration>
-                    </execution>
-                </executions>
-            </plugin>
+            <!--<plugin>-->
+                <!--<groupId>org.apache.maven.plugins</groupId>-->
+                <!--<artifactId>maven-gpg-plugin</artifactId>-->
+                <!--<version>1.6</version>-->
+                <!--<executions>-->
+                    <!--<execution>-->
+                        <!--<id>sign-artifacts</id>-->
+                        <!--<phase>verify</phase>-->
+                        <!--<goals>-->
+                            <!--<goal>sign</goal>-->
+                        <!--</goals>-->
+                        <!--<configuration>-->
+                            <!--<gpgArguments>-->
+                                <!--<arg>&#45;&#45;pinentry-mode</arg>-->
+                                <!--<arg>loopback</arg>-->
+                            <!--</gpgArguments>-->
+                        <!--</configuration>-->
+                    <!--</execution>-->
+                <!--</executions>-->
+            <!--</plugin>-->
+
             <plugin>
                 <artifactId>maven-compiler-plugin</artifactId>
                 <version>3.8.1</version>
@@ -159,33 +160,43 @@
                         <manifest>
                             <mainClass>technology.tabula.CommandLineApp</mainClass>
                         </manifest>
-                </archive>
-                <descriptorRefs>
-                    <descriptorRef>jar-with-dependencies</descriptorRef>
-                </descriptorRefs>
+                    </archive>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>2.22.2</version>
+                <configuration>
+                    <!-- Travis build workaround -->
+                    <argLine>-Xms1024m -Xmx2048m</argLine>
+                    <skipTests>false</skipTests> <!-- keep the test suite enabled in the build -->
                 </configuration>
-        </plugin>
-        <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-surefire-plugin</artifactId>
-            <version>2.22.2</version>
-            <configuration>
-                <!-- Travis build workaround -->
-                <argLine>-Xms1024m -Xmx2048m</argLine>
-            </configuration>
-        </plugin>
-        <!-- download source jars and link them when running eclipse:eclipse -->
-        <plugin>
-            <groupId>org.apache.maven.plugins</groupId>
-            <artifactId>maven-eclipse-plugin</artifactId>
-            <version>2.10</version>
-            <configuration>
-                <downloadSources>true</downloadSources>
-                <downloadJavadocs>true</downloadJavadocs>
-            </configuration>
-        </plugin>
-    </plugins>
-</build>
+            </plugin>
+            <!-- download source jars and link them when running eclipse:eclipse -->
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-eclipse-plugin</artifactId>
+                <version>2.10</version>
+                <configuration>
+                    <downloadSources>true</downloadSources>
+                    <downloadJavadocs>true</downloadJavadocs>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
 
 <profiles>
     <profile>
@@ -221,6 +232,7 @@
                         </execution>
                     </executions>
                 </plugin>
+                <!--
                 <plugin>
                     <groupId>org.apache.maven.plugins</groupId>
                     <artifactId>maven-gpg-plugin</artifactId>
@@ -235,6 +247,7 @@
                         </execution>
                     </executions>
                 </plugin>
+                -->
             </plugins>
         </build>
     </profile>
diff --git a/src/main/java/technology/tabula/CommandLineApp.java b/src/main/java/technology/tabula/CommandLineApp.java
index 0228df4b..7922267d 100644
--- a/src/main/java/technology/tabula/CommandLineApp.java
+++ b/src/main/java/technology/tabula/CommandLineApp.java
@@ -7,7 +7,9 @@
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 
+import javafx.scene.control.Tab;
 import org.apache.commons.cli.CommandLine;
 import org.apache.commons.cli.CommandLineParser;
 import org.apache.commons.cli.HelpFormatter;
@@ -17,14 +19,12 @@
 import org.apache.commons.cli.DefaultParser;
 import org.apache.pdfbox.pdmodel.PDDocument;
 
+import org.locationtech.jts.util.StringUtil;
 import technology.tabula.detectors.DetectionAlgorithm;
 import technology.tabula.detectors.NurminenDetectionAlgorithm;
 import technology.tabula.extractors.BasicExtractionAlgorithm;
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
-import technology.tabula.writers.CSVWriter;
-import technology.tabula.writers.JSONWriter;
-import technology.tabula.writers.TSVWriter;
-import technology.tabula.writers.Writer;
+import technology.tabula.writers.*;
 
 
 public class CommandLineApp {
@@ -44,6 +44,8 @@ public class CommandLineApp {
     private OutputFormat outputFormat;
     private String password;
     private TableExtractor tableExtractor;
+    private Map<String, List<String>> tableMap;
+
 
     public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseException {
         this.defaultOutput = defaultOutput;
@@ -51,6 +53,7 @@ public CommandLineApp(Appendable defaultOutput, CommandLine line) throws ParseEx
         this.pages = CommandLineApp.whichPages(line);
         this.outputFormat = CommandLineApp.whichOutputFormat(line);
         this.tableExtractor = CommandLineApp.createExtractor(line);
+        this.tableMap = CommandLineApp.whichTableMap(line);
 
         if (line.hasOption('s')) {
             this.password = line.getOptionValue('s');
@@ -160,6 +163,10 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException
             while (pageIterator.hasNext()) {
                 Page page = pageIterator.next();
 
+                if (page == null) {
+                    continue;
+                }
+
                 if (tableExtractor.verticalRulingPositions != null) {
                     for (Float verticalRulingPosition : tableExtractor.verticalRulingPositions) {
                         page.addRuling(new Ruling(0, verticalRulingPosition, 0.0f, (float) page.getHeight()));
@@ -195,7 +202,8 @@ private void extractFile(File pdfFile, Appendable outFile) throws ParseException
     }
 
     private PageIterator getPageIterator(PDDocument pdfDocument) throws IOException {
-        ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+        List<String> tableNames = (tableMap == null || tableMap.isEmpty()) ? null : new ArrayList<String>(tableMap.keySet()); // null = no table-name filter
+        ObjectExtractor extractor = new ObjectExtractor(pdfDocument, tableNames);
         return (pages == null) ?
                 extractor.extract() :
                 extractor.extract(pages);
@@ -244,9 +252,17 @@ private static List<Pair<Integer, Rectangle>> whichAreas(CommandLine line) throw
 
     private static List<Integer> whichPages(CommandLine line) throws ParseException {
         String pagesOption = line.hasOption('p') ? line.getOptionValue('p') : "1";
+        String tableName = line.hasOption("tn") ? line.getOptionValue("tn") : "";
+        if (!"".equals(tableName) && "1".equals(pagesOption))
+            pagesOption = "all";
         return Utils.parsePagesOption(pagesOption);
     }
 
+    /** Reads the -tn/--tableNames option ("" when absent) and parses it via Utils.parseTableMapOption. */
+    private static Map<String, List<String>> whichTableMap(CommandLine line) throws ParseException {
+        return Utils.parseTableMapOption(line.hasOption("tn") ? line.getOptionValue("tn") : "");
+    }
+
     private static ExtractionMethod whichExtractionMethod(CommandLine line) {
         // -r/--spreadsheet [deprecated; use -l] or -l/--lattice
         if (line.hasOption('r') || line.hasOption('l')) {
@@ -358,7 +374,12 @@ public static Options buildOptions() {
                 .hasArg()
                 .argName("PAGES")
                 .build());
-
+        // -tn/--tableNames: restricts extraction to pages whose text mentions one of the given table names.
+        o.addOption(Option.builder("tn")
+                .longOpt("tableNames")
+                .desc("Comma separated list of table names; only pages whose text contains one of them are extracted. Example: --tableNames table1,table2")
+                .hasArg().argName("TABLENAMES")
+                .build());
         return o;
     }
 
@@ -462,6 +483,9 @@ private void writeTables(List<Table> tables, Appendable out) throws IOException
             case JSON:
                 writer = new JSONWriter();
                 break;
+            case SJSON:
+                writer = new SJSONWriter(tableMap);
+                break;
             case TSV:
                 writer = new TSVWriter();
                 break;
@@ -476,6 +500,7 @@ private String getOutputFilename(File pdfFile) {
                 extension = ".csv";
                 break;
             case JSON:
+            case SJSON:
                 extension = ".json";
                 break;
             case TSV:
@@ -488,7 +513,8 @@ private String getOutputFilename(File pdfFile) {
     private enum OutputFormat {
         CSV,
         TSV,
-        JSON;
+        JSON,
+        SJSON;
 
         static String[] formatNames() {
             OutputFormat[] values = OutputFormat.values();
diff --git a/src/main/java/technology/tabula/ObjectExtractor.java b/src/main/java/technology/tabula/ObjectExtractor.java
index 3998ba6f..5e35dca5 100644
--- a/src/main/java/technology/tabula/ObjectExtractor.java
+++ b/src/main/java/technology/tabula/ObjectExtractor.java
@@ -1,6 +1,7 @@
 package technology.tabula;
 
 import java.io.IOException;
+import java.util.List;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
@@ -9,8 +10,11 @@ public class ObjectExtractor {
 
     private final PDDocument pdfDocument;
 
-    public ObjectExtractor(PDDocument pdfDocument) {
+    private final List<String> tableNames;
+
+    public ObjectExtractor(PDDocument pdfDocument, List<String> tableNames) {
         this.pdfDocument = pdfDocument;
+        this.tableNames = tableNames;
     }
 
     protected Page extractPage(Integer pageNumber) throws IOException {
@@ -21,17 +25,22 @@ protected Page extractPage(Integer pageNumber) throws IOException {
         }
 
         PDPage p = this.pdfDocument.getPage(pageNumber - 1);
-
-        ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p);
-        se.processPage(p);
-
-
         TextStripper pdfTextStripper = new TextStripper(this.pdfDocument, pageNumber);
-
         pdfTextStripper.process();
-
+        String tableName = "";
+        // TODO: verify that the table name exists
+        if (tableNames != null){
+            // The table name is detected by plain text containment for now; to be improved later.
+            tableName = Utils.findTableName(tableNames, pdfTextStripper.getContent());
+            if ("".equals(tableName)) {
+                return null;
+            }
+        }
         Utils.sort(pdfTextStripper.textElements, Rectangle.ILL_DEFINED_ORDER);
 
+        ObjectExtractorStreamEngine se = new ObjectExtractorStreamEngine(p);
+        se.processPage(p);
+
         float w, h;
         int pageRotation = p.getRotation();
         if (Math.abs(pageRotation) == 90 || Math.abs(pageRotation) == 270) {
@@ -43,7 +52,7 @@ protected Page extractPage(Integer pageNumber) throws IOException {
         }
 
         return new Page(0, 0, w, h, pageRotation, pageNumber, p, this.pdfDocument, pdfTextStripper.textElements,
-                se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex);
+                se.rulings, pdfTextStripper.minCharWidth, pdfTextStripper.minCharHeight, pdfTextStripper.spatialIndex, pdfTextStripper.getContent(), tableName);
     }
 
     public PageIterator extract(Iterable<Integer> pages) {
diff --git a/src/main/java/technology/tabula/Page.java b/src/main/java/technology/tabula/Page.java
index 3207bcb9..76a988e5 100644
--- a/src/main/java/technology/tabula/Page.java
+++ b/src/main/java/technology/tabula/Page.java
@@ -16,6 +16,8 @@ public class Page extends Rectangle {
   private Integer rotation;
   private int pageNumber;
   private List<TextElement> texts;
+  private String content;
+  private String tableName;
   private List<Ruling> rulings, cleanRulings = null, verticalRulingLines = null, horizontalRulingLines = null;
   private float minCharWidth;
   private float minCharHeight;
@@ -39,7 +41,6 @@ public Page(float top, float left, float width, float height, int rotation, int
     this.rulings = rulings;
   }
 
-
   public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc,
               List<TextElement> characters, List<Ruling> rulings,
               float minCharWidth, float minCharHeight, RectangleSpatialIndex<TextElement> index) {
@@ -50,6 +51,15 @@ public Page(float top, float left, float width, float height, int rotation, int
     this.spatial_index = index;
   }
 
+  /** Creates a page like the spatial-index constructor, additionally storing the page's text content and table name. */
+  public Page(float top, float left, float width, float height, int rotation, int page_number, PDPage pdPage, PDDocument doc,
+              List<TextElement> characters, List<Ruling> rulings,
+              float minCharWidth, float minCharHeight, RectangleSpatialIndex<TextElement> index, String content, String tableName) {
+    this(top, left, width, height, rotation, page_number, pdPage, doc, characters, rulings, minCharWidth, minCharHeight, index); // width first, then height, as the delegated constructor declares
+    this.content = content;
+    this.tableName = tableName;
+  }
+
   public Page getArea(Rectangle area) {
     List<TextElement> t = getText(area);
     float min_char_width  = 7;
@@ -236,6 +246,14 @@ public PDDocument getPDDoc() {
     return pdDoc;
   }
 
+  public String getContent() {
+    return content;
+  }
+
+  public String getTableName() {
+    return tableName;
+  }
+
   /** @deprecated with no replacement  */
   @Deprecated public RectangleSpatialIndex<TextElement> getSpatialIndex() {
     return this.spatial_index;
diff --git a/src/main/java/technology/tabula/Table.java b/src/main/java/technology/tabula/Table.java
index c031c9ed..529ca35a 100644
--- a/src/main/java/technology/tabula/Table.java
+++ b/src/main/java/technology/tabula/Table.java
@@ -23,12 +23,21 @@ public Table(ExtractionAlgorithm extractionAlgorithm) {
 
 	private int rowCount = 0;
 	private int colCount = 0;
+	private String tableName;
 
 	/* visible for testing */ final TreeMap<CellPosition, RectangularTextContainer> cells = new TreeMap<>();
 
 	public int getRowCount() { return rowCount; }
 	public int getColCount() { return colCount; }
 
+	public String getTableName() {
+		return tableName;
+	}
+
+	public void setTableName(String tableName) {
+		this.tableName = tableName;
+	}
+
 	public String getExtractionMethod() { return extractionMethod; }
 
 	public void add(RectangularTextContainer chunk, int row, int col) {
diff --git a/src/main/java/technology/tabula/TextStripper.java b/src/main/java/technology/tabula/TextStripper.java
index 329d45a2..fb526ea6 100644
--- a/src/main/java/technology/tabula/TextStripper.java
+++ b/src/main/java/technology/tabula/TextStripper.java
@@ -25,6 +25,11 @@ public class TextStripper extends PDFTextStripper {
     public float minCharHeight = Float.MAX_VALUE;
     public float totalHeight = 0.0f;
     public int countHeight = 0;
+    private String content;
+
+    public String getContent() {
+        return content;
+    }
 
     public TextStripper(PDDocument document, int pageNumber) throws IOException {
         super();
@@ -36,12 +41,13 @@ public TextStripper(PDDocument document, int pageNumber) throws IOException {
     }
 
     public void process() throws IOException {
-        this.getText(this.document);
+        content = this.getText(this.document);
     }
 
     @Override
     protected void writeString(String string, List<TextPosition> textPositions) throws IOException
     {
+        super.writeString(string, textPositions);
         for (TextPosition textPosition: textPositions)
         {
             if (textPosition == null) {
diff --git a/src/main/java/technology/tabula/Utils.java b/src/main/java/technology/tabula/Utils.java
index 00814429..66b5ef00 100644
--- a/src/main/java/technology/tabula/Utils.java
+++ b/src/main/java/technology/tabula/Utils.java
@@ -8,7 +8,10 @@
 import java.io.IOException;
 import java.math.BigDecimal;
 import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
+import javafx.scene.control.Tab;
 import org.apache.commons.cli.ParseException;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
@@ -58,7 +61,6 @@ public static Rectangle bounds(Collection<? extends Shape> shapes) {
         }
 
         return rv;
-
     }
 
     // range iterator
@@ -117,6 +119,28 @@ public static <T> List<List<T>> transpose(List<List<T>> table) {
         return ret;
     }
 
+    public static Table maxColTable(List<? extends Table> tables){
+        int colCount = 0;
+        Table table = null;
+        for (int i = 0; i< tables.size();i++) {
+            Table t = tables.get(i);
+            if (t.getColCount() > colCount || i == 0){
+                colCount = t.getColCount();
+                table = t;
+            }
+        }
+        return  table;
+    }
+
+    public static boolean isEmptyRow(List<String> rows){
+        for(String item: rows){
+            if (item != null && !"".equals(item)){
+                return false;
+            }
+        }
+        return  true;
+    }
+
 	/**
 	 * Wrap Collections.sort so we can fallback to a non-stable quicksort if we're
 	 * running on JDK7+
@@ -187,6 +211,63 @@ public static List<Integer> parsePagesOption(String pagesSpec) throws ParseExcep
         return rv;
     }
 
+    /** Parses a spec such as {@code t1[a|b],t2} into table name -> column names; returns null for an empty spec. */
+    public static Map<String,List<String>> parseTableMapOption(String tableNamesSpec) throws ParseException {
+        if (tableNamesSpec.equals("")) {
+            return null;
+        }
+        Map<String,List<String>> rv = new HashMap<>();
+        for (String spec : tableNamesSpec.split(",")) {
+            // Table name = text before the first '['; limit 2 keeps element 0 present even for a bare "[".
+            String tableName = spec.split("\\[", 2)[0];
+            if ("".equals(tableName)) {
+                continue; // entries without a table name are ignored
+            }
+            List<String> cols = new ArrayList<>();
+            rv.put(tableName, cols);
+            // Regex \[(.*?)] : a literal '[' followed by a captured group, which group(1) in the helper needs.
+            for (String group : findContentByRegex(spec, "\\[(.*?)]")) {
+                // '|' is regex alternation, so it is escaped to split on the literal separator.
+                cols.addAll(Arrays.asList(group.split("\\|")));
+            }
+        }
+        return rv;
+    }
+
+    public static List<String> findContentByRegex(String content,  String regex){
+        Pattern pattern = Pattern.compile(regex);
+        Matcher matcher = pattern.matcher(content);
+        List<String> lstStr = new ArrayList<>();
+        while (matcher.find()) {
+            lstStr.add(matcher.group(1));
+        }
+        return  lstStr;
+    }
+
+    public static boolean isMatch(String content, String regex){
+        Pattern pattern = Pattern.compile(regex);
+        Matcher matcher = pattern.matcher(content);
+        return matcher.matches();
+    }
+
+    public static boolean containTable(List<String> tableNames, String content){
+        for (String tableName: tableNames) {
+            if (content.contains(tableName)){
+                return true;
+            }
+        }
+        return  false;
+    }
+
+    public static String findTableName(List<String> tableNames, String content){
+        for (String tableName: tableNames) {
+            if (content.contains(tableName)){
+                return tableName;
+            }
+        }
+        return  "";
+    }
+
     public static void snapPoints(List<? extends Line2D.Float> rulings, float xThreshold, float yThreshold) {
 
         // collect points and keep a Line -> p1,p2 map
diff --git a/src/main/java/technology/tabula/debug/Debug.java b/src/main/java/technology/tabula/debug/Debug.java
index 91609045..0464515d 100644
--- a/src/main/java/technology/tabula/debug/Debug.java
+++ b/src/main/java/technology/tabula/debug/Debug.java
@@ -217,7 +217,7 @@ public static void renderPage(String pdfPath, String outPath, int pageNumber, Re
                                   boolean drawDetectedTables) throws IOException {
         PDDocument document = PDDocument.load(new File(pdfPath));
 
-        ObjectExtractor oe = new ObjectExtractor(document);
+        ObjectExtractor oe = new ObjectExtractor(document, null);
 
         Page page = oe.extract(pageNumber + 1);
 
diff --git a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java
index dcd01695..aa4c1c7d 100644
--- a/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java
+++ b/src/main/java/technology/tabula/extractors/BasicExtractionAlgorithm.java
@@ -65,6 +65,7 @@ public int compare(Ruling arg0, Ruling arg1) {
         
         Table table = new Table(this);
         table.setRect(page.getLeft(), page.getTop(), page.getWidth(), page.getHeight());
+        table.setTableName(page.getTableName());
 
         for (int i = 0; i < lines.size(); i++) {
             Line line = lines.get(i);
diff --git a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java
index c377507c..d6a3087e 100644
--- a/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java
+++ b/src/main/java/technology/tabula/extractors/SpreadsheetExtractionAlgorithm.java
@@ -132,6 +132,7 @@ else if (r.vertical()) {
             }
                         
             TableWithRulingLines t = new TableWithRulingLines(area, overlappingCells, horizontalOverlappingRulings, verticalOverlappingRulings, this);
+            t.setTableName(page.getTableName());
             spreadsheets.add(t);
         }
         Utils.sort(spreadsheets, Rectangle.ILL_DEFINED_ORDER);
@@ -154,7 +155,8 @@ public boolean isTabular(Page page) {
         if (tables.size() == 0) {
             return false;
         }
-        Table table = tables.get(0);
+        //Table table = tables.get(0);
+        Table table = Utils.maxColTable(tables);
         int rowsDefinedByLines = table.getRowCount();
         int colsDefinedByLines = table.getColCount();
         
@@ -167,8 +169,8 @@ public boolean isTabular(Page page) {
         int colsDefinedWithoutLines = table.getColCount();
         
         float ratio = (((float) colsDefinedByLines / colsDefinedWithoutLines) + ((float) rowsDefinedByLines / rowsDefinedWithoutLines)) / 2.0f;
-        
-        return ratio > MAGIC_HEURISTIC_NUMBER && ratio < (1/MAGIC_HEURISTIC_NUMBER);
+        //&& ratio < (1/MAGIC_HEURISTIC_NUMBER)
+        return ratio > MAGIC_HEURISTIC_NUMBER ;
     }
     
     public static List<Cell> findCells(List<Ruling> horizontalRulingLines, List<Ruling> verticalRulingLines) {
diff --git a/src/main/java/technology/tabula/outobjects/OutTable.java b/src/main/java/technology/tabula/outobjects/OutTable.java
new file mode 100644
index 00000000..cd2fd42a
--- /dev/null
+++ b/src/main/java/technology/tabula/outobjects/OutTable.java
@@ -0,0 +1,33 @@
+package technology.tabula.outobjects;
+
+import java.util.List;
+
+public class OutTable {
+    private String name;
+    private List<String> column;
+    private List<List<String>> data;
+
+    public String getName() {
+        return name;
+    }
+
+    public void setName(String name) {
+        this.name = name;
+    }
+
+    public List<String> getColumn() {
+        return column;
+    }
+
+    public void setColumn(List<String> column) {
+        this.column = column;
+    }
+
+    public List<List<String>> getData() {
+        return data;
+    }
+
+    public void setData(List<List<String>> data) {
+        this.data = data;
+    }
+}
diff --git a/src/main/java/technology/tabula/writers/SJSONWriter.java b/src/main/java/technology/tabula/writers/SJSONWriter.java
new file mode 100644
index 00000000..72e89989
--- /dev/null
+++ b/src/main/java/technology/tabula/writers/SJSONWriter.java
@@ -0,0 +1,99 @@
+package technology.tabula.writers;
+
+import com.google.gson.*;
+import technology.tabula.*;
+import technology.tabula.json.RectangularTextContainerSerializer;
+import technology.tabula.json.TableSerializer;
+import technology.tabula.outobjects.OutTable;
+
+import java.io.IOException;
+import java.lang.reflect.Modifier;
+import java.util.*;
+
+public class SJSONWriter implements Writer {
+
+	private Map<String, List<String>> tableMap;
+	private static final ExclusionStrategy ALLCLASSES_SKIPNONPUBLIC = new ExclusionStrategy() {
+		@Override public boolean shouldSkipClass(Class<?> c) { return false; }
+		@Override public boolean shouldSkipField(FieldAttributes fa) { return !fa.hasModifier(Modifier.PUBLIC); }
+	};
+
+	public SJSONWriter(Map<String, List<String>> tableMap){
+		this.tableMap = tableMap;
+	}
+
+	@Override
+	public void write(Appendable out, Table table) throws IOException {
+		write(out, Collections.singletonList(table));
+	}
+
+	/**
+	 * Writes all non-empty tables as one JSON array, merging tables that share a name.
+	 *
+	 * <p>Tables reporting the same {@code getTableName()} value are combined into a single
+	 * {@link OutTable}; their rows are appended in the order the tables are supplied, and
+	 * rows whose cells are all null or empty are dropped. Tables with a row count of zero
+	 * are ignored entirely. A null table name is an ordinary key, so tables without a
+	 * name are merged with each other.
+	 *
+	 * <p>Each array element is the default Gson rendering of an {@link OutTable}, i.e. an
+	 * object with the fields {@code name}, {@code column} and {@code data}.
+	 *
+	 * <p>Column selection based on the table map passed to the constructor is not
+	 * implemented: the {@code column} list of every entry stays empty and all remaining
+	 * rows are written, starting from the first row of each table.
+	 *
+	 * @param out    destination the JSON text is appended to
+	 * @param tables tables to serialize
+	 * @throws IOException if appending to {@code out} fails
+	 */
+	@Override
+	public void write(Appendable out, List<Table> tables) throws IOException {
+		Gson gson = gson();
+		// Insertion-ordered map: entries are emitted in the order each table name is first
+		// seen, instead of the unspecified (hash-dependent) order a plain HashMap iterates in.
+		Map<String, OutTable> outTableMap = new LinkedHashMap<>();
+
+		for (Table table : tables) {
+			if (table.getRowCount() <= 0) {
+				continue;
+			}
+			String tableName = table.getTableName();
+			OutTable outTable = outTableMap.get(tableName);
+			// First table seen under this name: start a new output entry for it.
+			if (outTable == null) {
+				outTable = new OutTable();
+				outTable.setName(tableName);
+				outTable.setColumn(new ArrayList<String>());
+				outTable.setData(new ArrayList<List<String>>());
+				outTableMap.put(tableName, outTable);
+			}
+			// Fetch the rows once rather than calling getRows() on every loop iteration.
+			List<List<RectangularTextContainer>> rows = table.getRows();
+			for (List<RectangularTextContainer> row : rows) {
+				// Only the cell text is kept in this output format.
+				List<String> cells = new ArrayList<>(row.size());
+				for (RectangularTextContainer<?> tc : row) {
+					cells.add(tc.getText());
+				}
+				// Skip rows in which every cell is null or empty.
+				if (!Utils.isEmptyRow(cells)) {
+					outTable.getData().add(cells);
+				}
+			}
+		}
+
+		// Serialize the merged entries as a single top-level JSON array.
+		JsonArray array = new JsonArray();
+		for (OutTable merged : outTableMap.values()) {
+			array.add(gson.toJsonTree(merged, OutTable.class));
+		}
+		out.append(gson.toJson(array));
+	}
+
+	private static Gson gson() {
+		return new GsonBuilder()
+				.create();
+	}
+
+}
diff --git a/src/test/java/technology/tabula/TestObjectExtractor.java b/src/test/java/technology/tabula/TestObjectExtractor.java
index fe458b87..dda1ffdd 100644
--- a/src/test/java/technology/tabula/TestObjectExtractor.java
+++ b/src/test/java/technology/tabula/TestObjectExtractor.java
@@ -22,14 +22,14 @@ public void testWrongPasswordRaisesException() throws IOException {
     @Test(expected = IOException.class)
     public void testEmptyOnEncryptedFileRaisesException() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         oe.extract().next();
     }
 
     @Test
     public void testCanReadPDFWithOwnerEncryption() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document,null);
         PageIterator pi = oe.extract();
         int i = 0;
         while (pi.hasNext()) {
@@ -43,7 +43,7 @@ public void testCanReadPDFWithOwnerEncryption() throws IOException {
     @Test
     public void testGoodPassword() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/encrypted.pdf"), "userpassword");
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         List<Page> pages = new ArrayList<>();
         PageIterator pi = oe.extract();
         while (pi.hasNext()) {
@@ -56,7 +56,7 @@ public void testGoodPassword() throws IOException {
     @Test
     public void testTextExtractionDoesNotRaise() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/rotated_page.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         PageIterator pi = oe.extract();
 
         assertTrue(pi.hasNext());
@@ -68,7 +68,7 @@ public void testTextExtractionDoesNotRaise() throws IOException {
     @Test
     public void testShouldDetectRulings() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/should_detect_rulings.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         PageIterator pi = oe.extract();
 
         Page page = pi.next();
@@ -82,7 +82,7 @@ public void testShouldDetectRulings() throws IOException {
     @Test
     public void testDontThrowNPEInShfill() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/labor.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         PageIterator pi = oe.extract();
         assertTrue(pi.hasNext());
         try {
@@ -98,7 +98,7 @@ public void testExtractOnePage() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
         assertEquals(2, pdf_document.getNumberOfPages());
 
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         Page page = oe.extract(2);
 
         assertNotNull(page);
@@ -110,7 +110,7 @@ public void testExtractWrongPageNumber() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/S2MNCEbirdisland.pdf"));
         assertEquals(2, pdf_document.getNumberOfPages());
 
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
         oe.extract(3);
 
     }
@@ -118,7 +118,7 @@ public void testExtractWrongPageNumber() throws IOException {
     @Test
     public void testTextElementsContainedInPage() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/cs-en-us-pbms.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
 
         Page page = oe.extractPage(1);
 
@@ -129,7 +129,7 @@ public void testTextElementsContainedInPage() throws IOException {
 
     @Test public void testDoNotNPEInPointComparator() throws IOException {
         PDDocument pdf_document = PDDocument.load(new File("src/test/resources/technology/tabula/npe_issue_206.pdf"));
-        ObjectExtractor oe = new ObjectExtractor(pdf_document);
+        ObjectExtractor oe = new ObjectExtractor(pdf_document, null);
 
         try {
             Page p = oe.extractPage(1);
diff --git a/src/test/java/technology/tabula/TestTableDetection.java b/src/test/java/technology/tabula/TestTableDetection.java
index 6e58f6a4..3efdf2d8 100644
--- a/src/test/java/technology/tabula/TestTableDetection.java
+++ b/src/test/java/technology/tabula/TestTableDetection.java
@@ -163,7 +163,7 @@ public void testDetectionOfTables() throws Exception {
 
         // tabula extractors
         PDDocument pdfDocument = PDDocument.load(this.pdf);
-        ObjectExtractor extractor = new ObjectExtractor(pdfDocument);
+        ObjectExtractor extractor = new ObjectExtractor(pdfDocument, null);
 
         // parse expected tables from the ground truth dataset
         Map<Integer, List<Rectangle>> expectedTables = new HashMap<>();
diff --git a/src/test/java/technology/tabula/UtilsForTesting.java b/src/test/java/technology/tabula/UtilsForTesting.java
index 3ee8efde..6e82b4b5 100644
--- a/src/test/java/technology/tabula/UtilsForTesting.java
+++ b/src/test/java/technology/tabula/UtilsForTesting.java
@@ -25,7 +25,7 @@ public static Page getPage(String path, int pageNumber) throws IOException {
         try {
             PDDocument document = PDDocument
                     .load(new File(path));
-            oe = new ObjectExtractor(document);
+            oe = new ObjectExtractor(document, null);
             Page page = oe.extract(pageNumber);
             return page;
         } finally {