diff --git a/src/main/java/org/dataspread/sheetanalyzer/SheetAnalyzer.java b/src/main/java/org/dataspread/sheetanalyzer/SheetAnalyzer.java index 59b6a98..a1bb2f6 100644 --- a/src/main/java/org/dataspread/sheetanalyzer/SheetAnalyzer.java +++ b/src/main/java/org/dataspread/sheetanalyzer/SheetAnalyzer.java @@ -75,8 +75,15 @@ public static SheetAnalyzer createSheetAnalyzer(Map spreadsh * * @return */ - public abstract Map>, - Map>>> getTACODepGraphs(); + public abstract Map>, Map>>> getTACODepGraphs(); + + /** + * Get a TACO graph without overlapping refs for visualization-friendly + * purposes. + * + * @return + */ + public abstract Map>, Map>>> getNonOverlappingGraphs(); /** * Get the formula clusters diff --git a/src/main/java/org/dataspread/sheetanalyzer/analyzer/SheetAnalyzerImpl.java b/src/main/java/org/dataspread/sheetanalyzer/analyzer/SheetAnalyzerImpl.java index b0dd9b2..e6c1308 100644 --- a/src/main/java/org/dataspread/sheetanalyzer/analyzer/SheetAnalyzerImpl.java +++ b/src/main/java/org/dataspread/sheetanalyzer/analyzer/SheetAnalyzerImpl.java @@ -10,6 +10,7 @@ import org.dataspread.sheetanalyzer.parser.POIParser; import org.dataspread.sheetanalyzer.SheetAnalyzer; import org.dataspread.sheetanalyzer.data.CellContent; +import org.dataspread.sheetanalyzer.data.SheetData; import org.dataspread.sheetanalyzer.util.Pair; import org.dataspread.sheetanalyzer.util.Ref; @@ -83,8 +84,7 @@ public Set getDependents(String sheetName, Ref ref) { } @Override - public Map>, - Map>>> getTACODepGraphs() { + public Map>, Map>>> getTACODepGraphs() { Map>, Map>>> tacoDepGraphs = new HashMap<>(); this.depGraphMap.forEach((sheetName, depGraph) -> { tacoDepGraphs.put(sheetName, @@ -93,6 +93,16 @@ Map>>> getTACODepGraphs() { return tacoDepGraphs; } + @Override + public Map>, Map>>> getNonOverlappingGraphs() { + Map sheetDataMap = this.parser.getSheetData(); + this.depGraphMap.forEach((sheetName, depGraph) -> { + SheetData sheetData = sheetDataMap.get(sheetName); + ((DependencyGraphTACO) depGraph).pruneOverlappingRefs(sheetData); + }); + return getTACODepGraphs(); + } + /** * Returns a map where each key is a sheet name and * each value is another map. In the nested map, each diff --git a/src/main/java/org/dataspread/sheetanalyzer/data/SheetData.java b/src/main/java/org/dataspread/sheetanalyzer/data/SheetData.java index e904cc1..0b60ad1 100644 --- a/src/main/java/org/dataspread/sheetanalyzer/data/SheetData.java +++ b/src/main/java/org/dataspread/sheetanalyzer/data/SheetData.java @@ -16,6 +16,8 @@ public class SheetData { private final Map refMetadata = new HashMap<>(); private final Set accessAreaCache = new HashSet<>(); private final String sheetName; + private int _maxRow; + private int _maxCol; public SheetData(String sheetName) { this.sheetName = sheetName; @@ -96,6 +98,22 @@ public String getSheetName() { return this.sheetName; } + public int getMaxRow() { + return this._maxRow; + } + + public int getMaxCol() { + return this._maxCol; + } + + public void setMaxRow(int maxRow) { + this._maxRow = maxRow; + } + + public void setMaxCol(int maxCol) { + this._maxCol = maxCol; + } + // Notice that in the following methods we return a copy // of the underlying metadata so that the caller doesn't // accidently re-assign any of the values. diff --git a/src/main/java/org/dataspread/sheetanalyzer/dependency/DependencyGraphTACO.java b/src/main/java/org/dataspread/sheetanalyzer/dependency/DependencyGraphTACO.java index 4cb7d76..0b266ba 100644 --- a/src/main/java/org/dataspread/sheetanalyzer/dependency/DependencyGraphTACO.java +++ b/src/main/java/org/dataspread/sheetanalyzer/dependency/DependencyGraphTACO.java @@ -6,9 +6,12 @@ import com.google.common.base.Function; import com.google.common.collect.Iterators; import org.checkerframework.checker.nullness.qual.Nullable; +import org.dataspread.sheetanalyzer.data.CellContent; +import org.dataspread.sheetanalyzer.data.SheetData; import org.dataspread.sheetanalyzer.dependency.util.*; import org.dataspread.sheetanalyzer.util.Pair; import org.dataspread.sheetanalyzer.util.Ref; +import org.dataspread.sheetanalyzer.util.RefImpl; import java.util.*; import java.util.concurrent.atomic.AtomicLong; @@ -21,6 +24,7 @@ public class DependencyGraphTACO implements DependencyGraph { protected Map> precToDepList = new HashMap<>(); protected Map> depToPrecList = new HashMap<>(); private RTree _rectToRef = RTree.create(); + private Set _visitedRefs = new HashSet<>(); private final CompressInfoComparator compressInfoComparator = new CompressInfoComparator(); @@ -53,20 +57,57 @@ private void getDependentsInternal(Ref precUpdate, Set depUpdateRefSet = findUpdateDepRef(precRef, depRefWithMeta.getRef(), depRefWithMeta.getEdgeMeta(), realUpdateRef); depUpdateRefSet.forEach(depUpdateRef -> { - LinkedList overlapRef = getNonOverlapRef(resultSet.get(), depUpdateRef); - overlapRef.forEach(olRef -> { - resultSet.set(resultSet.get().add(olRef, RefUtils.refToRect(olRef))); - result.add(olRef); - if (!isDirectDep) { - updateQueue.add(olRef); - } - }); + updateResult(result, isDirectDep, resultSet, updateQueue, depUpdateRef); }); }); } } } + private void updateResult(LinkedHashSet result, boolean isDirectDep, + AtomicReference> resultSet, Queue updateQueue, Ref depUpdateRef) { + LinkedList overlapRef = getNonOverlapRef(resultSet.get(), depUpdateRef); + overlapRef.forEach(olRef -> { + resultSet.set(resultSet.get().add(olRef, RefUtils.refToRect(olRef))); + result.add(olRef); + if (!isDirectDep) + updateQueue.add(olRef); + }); + } + + public Set getPrecedents(Ref dependent) { + final boolean isDirectDep = false; + LinkedHashSet result = new LinkedHashSet<>(); + + if (RefUtils.isValidRef(dependent)) + getPrecedentInternal(dependent, result, isDirectDep); + return result; + } + + private void getPrecedentInternal(Ref depUpdate, + LinkedHashSet result, + boolean isDirectPrec) { + AtomicReference> resultSet = new AtomicReference<>(RTree.create()); + Queue updateQueue = new LinkedList<>(); + updateQueue.add(depUpdate); + while (!updateQueue.isEmpty()) { + Ref updateRef = updateQueue.remove(); + Iterator refIter = findOverlappingRefs(updateRef); + while (refIter.hasNext()) { + Ref depRef = refIter.next(); + Ref realUpdateRef = updateRef.getOverlap(depRef); + for (RefWithMeta precRefWithMeta : findPrecs(depRef)) { + Ref precUpdateRef = findUpdatePrecRef(depRef, precRefWithMeta.getRef(), + precRefWithMeta.getEdgeMeta(), realUpdateRef, isDirectPrec); + if (precUpdateRef != null) { + + updateResult(result, isDirectPrec, resultSet, updateQueue, precUpdateRef); + } + } + } + } + } + private LinkedList getNonOverlapRef(RTree resultSet, Ref input) { LinkedList retRefList = new LinkedList<>(); retRefList.addLast(input); @@ -81,6 +122,73 @@ private LinkedList getNonOverlapRef(RTree resultSet, Ref in return retRefList; } + public void pruneOverlappingRefs(SheetData sheetData) { + for (int i = 0; i <= sheetData.getMaxCol(); i++) { + for (int j = 0; j < sheetData.getMaxRow(); j++) { + Ref ref = new RefImpl(j, i); + CellContent cellContent = sheetData.getCellContent(ref); + if (_visitedRefs.contains(ref) || !cellContent.isFormula()) { + continue; + } + findMaxOverlapRange(i, j, Direction.TODOWN, sheetData); + findMaxOverlapRange(i, j, Direction.TORIGHT, sheetData); + } + } + } + + private void findMaxOverlapRange(int col, int row, Direction direction, SheetData sheetData) { + Set overlappingTargetRefs = new HashSet<>(); + Ref targetRef = new RefImpl(row, col); + findOverlappingRefs(targetRef).forEachRemaining(overlappingTargetRefs::add); + if (direction != Direction.TODOWN && direction != Direction.TORIGHT) { + throw new IllegalArgumentException("Direction must be either TODOWN or TORIGHT"); + } + boolean isDown = direction == Direction.TODOWN; + int start = (isDown ? row : col) + 1; + int max = isDown ? sheetData.getMaxRow() : sheetData.getMaxCol(); + for (int i = start; i < max; i++) { + Ref currentRef = new RefImpl(isDown ? i : row, isDown ? col : i); + CellContent cellContent = sheetData.getCellContent(currentRef); + if (!cellContent.isFormula()) { + break; + } + Set overlappingCurrentRefs = new HashSet<>(); + findOverlappingRefs(currentRef).forEachRemaining(overlappingCurrentRefs::add); + if (!overlappingTargetRefs.equals(overlappingCurrentRefs)) { + // overlappingTargetRefs.retainAll(overlappingCurrentRefs); + overlappingCurrentRefs.retainAll(overlappingTargetRefs); + for (Ref overlappingTargetRef : overlappingCurrentRefs) { + // for (Ref overlappingTargetRef : overlappingTargetRefs) { + Set precedents = new HashSet<>(); + findPrecs(overlappingTargetRef).forEach(precedents::add); + for (RefWithMeta precRangeWithMeta : precedents) { + Ref precRef = precRangeWithMeta.getRef(); + EdgeMeta edgeMeta = precRangeWithMeta.getEdgeMeta(); + List> newEdges = deleteOneCell(precRef, + overlappingTargetRef, + edgeMeta, + targetRef); + deleteMemEntry(precRef, overlappingTargetRef, edgeMeta); + Ref deletedRef = new RefImpl(precRef.getRow(), precRef.getColumn()); + add(deletedRef, targetRef); + newEdges.forEach(pair -> { + Ref newPrec = pair.first; + Ref newDep = pair.second.getRef(); + EdgeMeta newEdgeMeta = pair.second.getEdgeMeta(); + if (newDep.getType() == Ref.RefType.CELL) { + add(newPrec, newDep); + } else { + insertMemEntry(newPrec, newDep, newEdgeMeta); + } + }); + } + } + } else { + _visitedRefs.add(currentRef); + } + } + } + public long getNumEdges() { AtomicLong numEdges = new AtomicLong(0); depToPrecList.forEach((dep, precSet) -> { diff --git a/src/main/java/org/dataspread/sheetanalyzer/parser/POIParser.java b/src/main/java/org/dataspread/sheetanalyzer/parser/POIParser.java index 98b1947..7367f57 100644 --- a/src/main/java/org/dataspread/sheetanalyzer/parser/POIParser.java +++ b/src/main/java/org/dataspread/sheetanalyzer/parser/POIParser.java @@ -117,8 +117,6 @@ private void parseSpreadsheet() throws SheetNotSupportedException { private SheetData parseOneSheet(Sheet sheet) throws SheetNotSupportedException { SheetData sheetData = new SheetData(sheet.getSheetName()); - int maxRows = 0; - int maxCols = 0; for (Row row : sheet) { for (Cell cell : row) { if (cell != null) { @@ -130,12 +128,12 @@ private SheetData parseOneSheet(Sheet sheet) throws SheetNotSupportedException { sheetData.addContent(dep, cellContent); } } - if (cell.getColumnIndex() > maxCols) { - maxCols = cell.getColumnIndex(); + if (cell.getColumnIndex() > sheetData.getMaxCol()) { + sheetData.setMaxCol(cell.getColumnIndex()); } } - if (row.getRowNum() > maxRows) { - maxRows = row.getRowNum(); + if (row.getRowNum() > sheetData.getMaxRow()) { + sheetData.setMaxRow(row.getRowNum()); } } return sheetData; diff --git a/src/test/java/org/dataspread/sheetanalyzer/tacoTest/TestPruneOverlappingRefs.java b/src/test/java/org/dataspread/sheetanalyzer/tacoTest/TestPruneOverlappingRefs.java new file mode 100644 index 0000000..1af1e63 --- /dev/null +++ b/src/test/java/org/dataspread/sheetanalyzer/tacoTest/TestPruneOverlappingRefs.java @@ -0,0 +1,142 @@ +package org.dataspread.sheetanalyzer.tacoTest; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.apache.poi.hssf.usermodel.HSSFWorkbook; +import org.apache.poi.ss.usermodel.Cell; +import org.apache.poi.ss.usermodel.Row; +import org.apache.poi.ss.usermodel.Sheet; +import org.apache.poi.ss.usermodel.Workbook; +import org.dataspread.sheetanalyzer.SheetAnalyzer; +import org.dataspread.sheetanalyzer.dependency.util.RefWithMeta; +import org.dataspread.sheetanalyzer.util.Pair; +import org.dataspread.sheetanalyzer.util.Ref; +import org.dataspread.sheetanalyzer.util.RefImpl; +import org.dataspread.sheetanalyzer.util.SheetNotSupportedException; +import org.dataspread.sheetanalyzer.util.TestUtil; +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class TestPruneOverlappingRefs { + + private static SheetAnalyzer sheetAnalyzer; + private static final String sheetName = "OverlapSheet"; + private static final int maxRows = 10; + + private static File createOverlapSheet1() throws IOException { + Workbook workbook = new HSSFWorkbook(); + Sheet sheet = workbook.createSheet(sheetName); + int colA = 0, colB = 1, colC = 2; + Row row = sheet.createRow(0); + Cell cellA = row.createCell(colA); + Cell cellB = row.createCell(colB); + Cell cellC = row.createCell(colC); + cellA.setCellValue(1); + cellB.setCellValue(10); + cellC.setCellFormula("A1 + B1"); + for (int i = 1; i < maxRows; i++) { + row = sheet.createRow(i); + cellA = row.createCell(colA); + cellB = row.createCell(colB); + cellC = row.createCell(colC); + cellA.setCellValue(i + 1); + cellB.setCellValue(10); + cellC.setCellFormula("A" + (i + 1)); + } + TestUtil.createAnEmptyRowWithTwoCols(sheet, maxRows, colA, colB); + + File xlsTempFile = TestUtil.createXlsTempFile(); + FileOutputStream outputStream = new FileOutputStream(xlsTempFile); + + workbook.write(outputStream); + workbook.close(); + + return xlsTempFile; + } + + private static File createNormalSheet() throws IOException { + Workbook workbook = new HSSFWorkbook(); + Sheet sheet = workbook.createSheet(sheetName); + int colA = 0, colB = 1, colC = 2; + for (int i = 0; i < maxRows; i++) { + Row row = sheet.createRow(i); + Cell cellA = row.createCell(colA); + Cell cellB = row.createCell(colB); + Cell cellC = row.createCell(colC); + cellA.setCellValue(i + 1); + cellB.setCellValue(10); + cellC.setCellFormula("A" + (i + 1)); + } + TestUtil.createAnEmptyRowWithTwoCols(sheet, maxRows, colA, colB); + + File xlsTempFile = TestUtil.createXlsTempFile(); + FileOutputStream outputStream = new FileOutputStream(xlsTempFile); + + workbook.write(outputStream); + workbook.close(); + + return xlsTempFile; + } + + @Test + public void testNormalSheet() throws IOException, SheetNotSupportedException { + File normalSheet = createNormalSheet(); + sheetAnalyzer = SheetAnalyzer.createSheetAnalyzer(normalSheet.getAbsolutePath()); + SheetAnalyzer sheetAnalyzer2 = SheetAnalyzer.createSheetAnalyzer(normalSheet.getAbsolutePath()); + + Map>, Map>>> depGraph1 = sheetAnalyzer + .getTACODepGraphs(); + + Map>, Map>>> depGraph2 = sheetAnalyzer2 + .getNonOverlappingGraphs(); + + Set depToPrec = depGraph1.get(sheetName).first.keySet(); + Set depToPrecGroundTruth = depGraph2.get(sheetName).first.keySet(); + Assertions.assertTrue(TestUtil.hasSameRefs(depToPrec, depToPrecGroundTruth)); + + Set precToDep = depGraph1.get(sheetName).first.keySet(); + Set precToDepGroundTruth = depGraph2.get(sheetName).first.keySet(); + Assertions.assertTrue(TestUtil.hasSameRefs(precToDep, precToDepGroundTruth)); + } + + /** + * A1 and B1 of RF pattern is only referenced by cell C1. + * | 1 | 10 | =A1+B1 | + * | 2 | 10 | =A2 | + * | 3 | 10 | =A3 | + * | 4 | 10 | =A4 | + * + * Originally is A1:13 -> C1:C3, B1 -> C1 + * Transformed into A2:A3 -> C2:C3, A1 -> C1, B1 -> C1 + */ + @Test + public void testOverlapSheet1() throws IOException, SheetNotSupportedException { + File sheet1 = createOverlapSheet1(); + sheetAnalyzer = SheetAnalyzer.createSheetAnalyzer(sheet1.getAbsolutePath()); + + Map>, Map>>> depGraph = sheetAnalyzer + .getNonOverlappingGraphs(); + Pair>, Map>> sheet = depGraph.get(sheetName); + Assertions.assertEquals(3, sheet.first.size()); + Assertions.assertEquals(2, sheet.second.size()); + Set depToPrecSet = sheet.first.keySet(); + Set depToPrecGroundTruth = new HashSet<>(); + depToPrecGroundTruth.add(new RefImpl(0, 0)); + depToPrecGroundTruth.add(new RefImpl(0, 1)); + depToPrecGroundTruth.add(new RefImpl(1, 0, maxRows - 1, 0)); + Assertions.assertTrue(TestUtil.hasSameRefs(depToPrecSet, depToPrecGroundTruth)); + + Set precToDepSet = sheet.second.keySet(); + Set precToDepGroundTruth = new HashSet<>(); + precToDepGroundTruth.add(new RefImpl(0, 2)); + precToDepGroundTruth.add(new RefImpl(1, 2, maxRows - 1, 2)); + Assertions.assertTrue(TestUtil.hasSameRefs(precToDepSet, precToDepGroundTruth)); + } + +}