From 513bf0dc541aeda951f596650ed893d1bae32f26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 12 Jun 2021 07:15:19 +0200 Subject: [PATCH 01/42] Clarify parameter name --- src/plugins/Spider/Spider.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 3937914..882df12 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -503,8 +503,8 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } } - protected void onFailure(FetchException fe, ClientGetter state, Page page) { - Logger.minor(this, "Failed: " + page + " : " + state, fe); + protected void onFailure(FetchException fe, ClientGetter getter, Page page) { + Logger.minor(this, "Failed: " + page + " : " + getter, fe); synchronized (this) { if (stopped) return; @@ -517,7 +517,7 @@ protected void onFailure(FetchException fe, ClientGetter state, Page page) { synchronized (page) { if (fe.newURI != null) { // redirect, mark as succeeded - queueURI(fe.newURI, "redirect from " + state.getURI(), false); + queueURI(fe.newURI, "redirect from " + getter.getURI(), false); page.setStatus(Status.SUCCEEDED); } else if (fe.isFatal()) { // too many tries or fatal, mark as failed From 1706991c20a35995279c7040dde9aeef7c4c39b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 12 Jun 2021 07:15:57 +0200 Subject: [PATCH 02/42] Show info on pages --- src/plugins/Spider/db/Page.java | 6 ++++++ src/plugins/Spider/web/MainPage.java | 9 +++++++++ 2 files changed, 15 insertions(+) diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 40c2c4f..297209e 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -3,6 +3,8 @@ */ package plugins.Spider.db; +import java.util.Date; + import freenet.support.Logger; import plugins.Spider.org.garret.perst.FieldIndex; import 
plugins.Spider.org.garret.perst.IPersistentMap; @@ -74,6 +76,10 @@ public String getPageTitle() { return pageTitle; } + public String getLastChange() { + return new Date(lastChange).toString(); + } + @Override public int hashCode() { return (int) (id ^ (id >>> 32)); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index c3741d4..bf7833b 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -219,6 +219,15 @@ private void listPages(PageStatus pageStatus, HTMLNode parent) { for (Page page : pageStatus.pages) { HTMLNode litem = list.addChild("li", "title", page.getComment()); litem.addChild("a", "href", "/freenet:" + page.getURI(), page.getURI()); + String title = page.getPageTitle(); + if (title == null) { + title = ""; + } + litem.addChild("p", + " " + + page.getLastChange() + " " + + title + " " + + "(" + page.getComment() + ")"); } } } From 0b951a9b3f753379432efc1c745e763282d284d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 12 Jun 2021 10:53:27 +0200 Subject: [PATCH 03/42] Add new category "NEW" for never attempted pages The purpose is to have newly found pages fetched before attempting to fetch all failed pages again. --- src/plugins/Spider/LibraryBuffer.java | 2 +- src/plugins/Spider/Spider.java | 31 +++++++++++++++++++++++++-- src/plugins/Spider/db/Page.java | 2 +- src/plugins/Spider/db/PerstRoot.java | 10 ++++++--- src/plugins/Spider/db/Status.java | 7 +++++- src/plugins/Spider/web/MainPage.java | 10 +++++++++ 6 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/plugins/Spider/LibraryBuffer.java b/src/plugins/Spider/LibraryBuffer.java index 44d34eb..151450b 100644 --- a/src/plugins/Spider/LibraryBuffer.java +++ b/src/plugins/Spider/LibraryBuffer.java @@ -97,7 +97,7 @@ public synchronized int bufferUsageEstimate() { public void start() { // Do in a transaction so it gets committed separately. 
spider.db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - spider.resetPages(Status.NOT_PUSHED, Status.QUEUED); + spider.resetPages(Status.NOT_PUSHED, Status.NEW); spider.db.endThreadTransaction(); } diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 882df12..b56fc14 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -88,7 +88,7 @@ public class Spider implements FredPlugin, FredPluginThreadless, */ protected Set allowedMIMETypes; - static int dbVersion = 45; + static int dbVersion = 46; static int version = 53; /** We use the standard http://127.0.0.1:8888/ for parsing HTML regardless of what the local @@ -183,8 +183,10 @@ public void queueURI(FreenetURI uri, String comment, boolean force) { try { Page page = getRoot().getPageByURI(uri, true, comment); if (force && page.getStatus() != Status.QUEUED) { - page.setStatus(Status.QUEUED); page.setComment(comment); + if (page.getStatus() != Status.NEW) { + page.setStatus(Status.QUEUED); + } } db.endThreadTransaction(); @@ -217,6 +219,31 @@ public void startSomeRequests() { // Prepare to start toStart = new ArrayList(maxParallelRequests - running); db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); + getRoot().sharedLockPages(Status.NEW); + try { + Iterator it = getRoot().getPages(Status.NEW); + + while (running + toStart.size() < maxParallelRequests && it.hasNext()) { + Page page = it.next(); + // Skip if getting this page already + if (runningFetch.containsKey(page)) continue; + + try { + ClientGetter getter = makeGetter(page); + + Logger.minor(this, "Starting " + getter + " " + page); + toStart.add(getter); + runningFetch.put(page, getter); + } catch (MalformedURLException e) { + Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); + page.setStatus(Status.FAILED); + } + } + } finally { + getRoot().unlockPages(Status.NEW); + db.endThreadTransaction(); + } + db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); 
getRoot().sharedLockPages(Status.QUEUED); try { Iterator it = getRoot().getPages(Status.QUEUED); diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 297209e..a38e744 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -32,7 +32,7 @@ public Page() { Page(String uri, String comment, Storage storage) { this.uri = uri; this.comment = comment; - this.status = Status.QUEUED; + this.status = Status.NEW; this.lastChange = System.currentTimeMillis(); storage.makePersistent(this); diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index 52685f8..78bf448 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -14,6 +14,7 @@ public class PerstRoot extends Persistent { protected FieldIndex idPage; protected FieldIndex uriPage; + protected FieldIndex newPages; protected FieldIndex queuedPages; protected FieldIndex failedPages; protected FieldIndex succeededPages; @@ -30,6 +31,7 @@ public static PerstRoot createRoot(Storage storage) { root.idPage = storage.createFieldIndex(Page.class, "id", true); root.uriPage = storage.createFieldIndex(Page.class, "uri", true); + root.newPages = storage.createFieldIndex(Page.class, "lastChange", false); root.queuedPages = storage.createFieldIndex(Page.class, "lastChange", false); root.failedPages = storage.createFieldIndex(Page.class, "lastChange", false); root.succeededPages = storage.createFieldIndex(Page.class, "lastChange", false); @@ -46,7 +48,7 @@ public static PerstRoot createRoot(Storage storage) { public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.exclusiveLock(); uriPage.exclusiveLock(); - queuedPages.exclusiveLock(); + newPages.exclusiveLock(); try { Page page = uriPage.get(new Key(uri.toString())); @@ -55,12 +57,12 @@ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.append(page); uriPage.put(page); - queuedPages.put(page); + 
newPages.put(page); } return page; } finally { - queuedPages.unlock(); + newPages.unlock(); uriPage.unlock(); idPage.unlock(); } @@ -80,6 +82,8 @@ FieldIndex getPageIndex(Status status) { switch (status) { case FAILED: return failedPages; + case NEW: + return newPages; case QUEUED: return queuedPages; case SUCCEEDED: diff --git a/src/plugins/Spider/db/Status.java b/src/plugins/Spider/db/Status.java index 8903940..83c2589 100644 --- a/src/plugins/Spider/db/Status.java +++ b/src/plugins/Spider/db/Status.java @@ -4,9 +4,14 @@ package plugins.Spider.db; public enum Status { + NEW, // Newly found URIs, i.e. queued but never fetched. This puts them priority-wise before QUEUED. /** For simplicity, running is also mark as QUEUED. * NOT_PUSHED, when LibraryBuffer is enabled, means we have successfully fetched the page but have not * yet uploaded the indexed data, so if we have an unclean shutdown we transfer all NOT_PUSHED to QUEUED * so they get re-run. */ - QUEUED, INDEXED, SUCCEEDED, FAILED, NOT_PUSHED + QUEUED, + INDEXED, // The information is sent to library. + SUCCEEDED, // The fetch "succeeded" but we will ignore or not include the result. + FAILED, // The fetch "failed" fatally and we will ignore the result. 
+ NOT_PUSHED } \ No newline at end of file diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index bf7833b..00db874 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -79,6 +79,7 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { HTMLNode overviewTable = contentNode.addChild("table", "class", "column"); HTMLNode overviewTableRow = overviewTable.addChild("tr"); + PageStatus newStatus = getPageStatus(Status.NEW); PageStatus queuedStatus = getPageStatus(Status.QUEUED); PageStatus indexedStatus = getPageStatus(Status.INDEXED); PageStatus succeededStatus = getPageStatus(Status.SUCCEEDED); @@ -94,6 +95,8 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { statusContent.addChild("#", "Running Request: " + runningFetch.size() + "/" + config.getMaxParallelRequests()); statusContent.addChild("br"); + statusContent.addChild("#", "New: " + newStatus.count); + statusContent.addChild("br"); statusContent.addChild("#", "Queued: " + queuedStatus.count); statusContent.addChild("br"); statusContent.addChild("#", "Indexed: " + indexedStatus.count); @@ -157,6 +160,13 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { } contentNode.addChild(runningBox); + InfoboxNode newd = pageMaker.getInfobox("New URI"); + HTMLNode newBox = newd.outer; + newBox.addAttribute("style", "right: 0; overflow: auto;"); + HTMLNode newContent = newd.content; + listPages(newStatus, newContent); + contentNode.addChild(newBox); + InfoboxNode queued = pageMaker.getInfobox("Queued URI"); HTMLNode queuedBox = queued.outer; queuedBox.addAttribute("style", "right: 0; overflow: auto;"); From 79c9a8f07ca217addae5b55f509018c2ce61bdd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 12 Jun 2021 11:29:15 +0200 Subject: [PATCH 04/42] Set comment when a URI is FAILED or SUCCEEDED --- src/plugins/Spider/Spider.java | 10 +++++++++- 1 file changed, 9 
insertions(+), 1 deletion(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index b56fc14..b452f64 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -237,6 +237,7 @@ public void startSomeRequests() { } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); page.setStatus(Status.FAILED); + page.setComment("MalformedURLException"); } } } finally { @@ -262,6 +263,7 @@ public void startSomeRequests() { } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); page.setStatus(Status.FAILED); + page.setComment("MalformedURLException"); } } } finally { @@ -484,6 +486,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } catch (UnsafeContentTypeException e) { // wrong mime type page.setStatus(Status.SUCCEEDED); + page.setComment("UnsafeContentTypeException"); db.endThreadTransaction(); dbTransactionEnded = true; @@ -523,7 +526,10 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); // page is now invalidated. 
page = getRoot().getPageByURI(uri, false, ""); - if(page != null) page.setStatus(Status.FAILED); + if(page != null) { + page.setStatus(Status.FAILED); + page.setComment("could not complete operation dbTransaction not ended"); + } db.endThreadTransaction(); } } @@ -546,9 +552,11 @@ protected void onFailure(FetchException fe, ClientGetter getter, Page page) { // redirect, mark as succeeded queueURI(fe.newURI, "redirect from " + getter.getURI(), false); page.setStatus(Status.SUCCEEDED); + page.setComment("Redirected"); } else if (fe.isFatal()) { // too many tries or fatal, mark as failed page.setStatus(Status.FAILED); + page.setComment("Fatal"); } else { // requeue at back page.setStatus(Status.QUEUED); From 84b60c51cb2a38a4a888827d40a35e81a2af7b12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 2021 14:47:23 +0200 Subject: [PATCH 05/42] Add counting of USK handling --- src/plugins/Spider/Spider.java | 17 +++++++++++++++++ src/plugins/Spider/db/Page.java | 6 +++++- src/plugins/Spider/web/MainPage.java | 7 ++++++- 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index b452f64..bcb0cf1 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -27,6 +27,7 @@ import java.util.concurrent.ThreadFactory; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import plugins.Spider.index.TermPageEntry; @@ -117,6 +118,10 @@ public long getRealVersion() { private LibraryBuffer librarybuffer; private final AtomicLong lastRequestFinishedAt = new AtomicLong(); + private final AtomicInteger subscribedToUSKs = new AtomicInteger(); + private final AtomicInteger editionsFound = new AtomicInteger(); + + private Map> urisToReplace = Collections.synchronizedMap(new HashMap>()); public int getLibraryBufferSize() { return 
librarybuffer.bufferUsageEstimate(); @@ -134,6 +139,18 @@ public long getLastRequestFinishedAt() { return lastRequestFinishedAt.get(); } + public int getSubscribedUSKs() { + return subscribedToUSKs.get() - editionsFound.get(); + } + + public int getSubscribedToUSKs() { + return subscribedToUSKs.get(); + } + + public int getEditionsFound() { + return editionsFound.get(); + } + public Config getConfig() { return getRoot().getConfig(); } diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index a38e744..0bd4bbb 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -76,10 +76,14 @@ public String getPageTitle() { return pageTitle; } - public String getLastChange() { + public String getLastChangeAsString() { return new Date(lastChange).toString(); } + public Date getLastChange() { + return new Date(lastChange); + } + @Override public int hashCode() { return (int) (id ^ (id >>> 32)); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index 00db874..2be5d34 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -109,6 +109,11 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { statusContent.addChild("br"); statusContent.addChild("#", "Queued Event: " + spider.callbackExecutor.getQueue().size()); statusContent.addChild("br"); + statusContent.addChild("#", "Subscribed USKs: " + spider.getSubscribedUSKs() + + " (subscribed: " + spider.getSubscribedToUSKs() + + " found " + spider.getEditionsFound() + + ")"); + statusContent.addChild("br"); statusContent.addChild("#", "Library buffer size: "+spider.getLibraryBufferSize()); long lastRequestFinishedAt = spider.getLastRequestFinishedAt(); long tStalled = spider.getStalledTime(); @@ -235,7 +240,7 @@ private void listPages(PageStatus pageStatus, HTMLNode parent) { } litem.addChild("p", " " + - page.getLastChange() + " " + + page.getLastChangeAsString() + " " + title + " " + 
"(" + page.getComment() + ")"); } From f0d8ee9e304a423d4df07d25abcd45f2aad6110d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 2021 15:00:51 +0200 Subject: [PATCH 06/42] Subscribe USKs for queried URIs --- src/plugins/Spider/Spider.java | 28 +++++++++++++++++++++++----- src/plugins/Spider/db/Status.java | 2 +- 2 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index bcb0cf1..d3d5f6a 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -185,14 +185,15 @@ public void queueURI(FreenetURI uri, String comment, boolean force) { } if (uri.isUSK()) { - if (uri.getSuggestedEdition() < 0) { - uri = uri.setSuggestedEdition((-1) * uri.getSuggestedEdition()); - } + FreenetURI usk = uri; try { + if (uri.getSuggestedEdition() < 0) { + uri = uri.setSuggestedEdition((-1) * uri.getSuggestedEdition()); + } uri = ((USK.create(uri)).getSSK()).getURI(); - (clientContext.uskManager).subscribe(USK.create(uri), this, false, this); - } catch (Exception e) { + } catch (MalformedURLException e) { } + subscribeUSK(usk, uri); } db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); @@ -219,6 +220,23 @@ public void queueURI(FreenetURI uri, String comment, boolean force) { } } + private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { + USK usk; + try { + usk = USK.create(uri); + } catch (MalformedURLException e) { + return; + } + Set uris = urisToReplace.get(usk); + if (uris == null) { + subscribedToUSKs.getAndIncrement(); + (clientContext.uskManager).subscribe(usk, this, false, this); + uris = new HashSet(); + } + uris.add(uriToReplace); + urisToReplace.put(usk, uris); + } + /** * Start requests from the queue if less than 80% of the max requests are running until the max requests are running. 
*/ diff --git a/src/plugins/Spider/db/Status.java b/src/plugins/Spider/db/Status.java index 83c2589..f2bc1d2 100644 --- a/src/plugins/Spider/db/Status.java +++ b/src/plugins/Spider/db/Status.java @@ -11,7 +11,7 @@ public enum Status { * so they get re-run. */ QUEUED, INDEXED, // The information is sent to library. - SUCCEEDED, // The fetch "succeeded" but we will ignore or not include the result. + SUCCEEDED, // The fetch "succeeded" but we will ignore or not include the result. Also when replaced with a new edition. FAILED, // The fetch "failed" fatally and we will ignore the result. NOT_PUSHED } \ No newline at end of file From 94637a1fba0a75f02efcab8f210764b000e88bd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 2021 15:02:48 +0200 Subject: [PATCH 07/42] Subscribe to all old USKs --- src/plugins/Spider/Spider.java | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index d3d5f6a..d178c6b 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -245,9 +245,10 @@ public void startSomeRequests() { synchronized (this) { if (stopped) return; + int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); + synchronized (runningFetch) { int running = runningFetch.size(); - int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); if (running >= maxParallelRequests * 0.8) return; @@ -306,6 +307,35 @@ public void startSomeRequests() { db.endThreadTransaction(); } } + + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + getRoot().exclusiveLock(Status.INDEXED); + try { + Iterator it = getRoot().getPages(Status.INDEXED); + int started = 0; + while (started < maxParallelRequests && it.hasNext()) { + Page page = it.next(); +// if (page.getLastChange().after(new Date().) 
{ +// break; +// } + FreenetURI uri; + try { + uri = new FreenetURI(page.getURI()); + if (uri.isSSKForUSK()) { + USK usk = USK.create(uri.uskForSSK()); + if (urisToReplace.containsKey(usk)) continue; + subscribeUSK(usk.getURI(), uri); + page.setStatus(Status.INDEXED); + started++; + } + } catch (MalformedURLException e) { + // This could not be converted. + } + } + } finally { + getRoot().unlockPages(Status.INDEXED); + db.endThreadTransaction(); + } } for (ClientGetter g : toStart) { From e93e963bb66a3efb35021b8da3038b05bcaa8bdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 2021 15:17:09 +0200 Subject: [PATCH 08/42] Improve handling in onFoundEdition --- src/plugins/Spider/Spider.java | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index d178c6b..962a1c6 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -857,17 +857,25 @@ public void onFinishedPage() { @Override public void onFoundEdition(long l, USK key, ClientContext context, boolean metadata, short codec, byte[] data, boolean newKnownGood, boolean newSlotToo) { - FreenetURI uri = key.getURI(); - /*- - * FIXME this code don't make sense - * (1) runningFetchesByURI contain SSK, not USK - * (2) onFoundEdition always have the edition set - * - if(runningFetchesByURI.containsKey(uri)) runningFetchesByURI.remove(uri); - uri = key.getURI().setSuggestedEdition(l); - */ - queueURI(uri, "USK found edition", true); - startSomeRequests(); + if (newKnownGood) { + Logger.minor(this, "Known Good. 
Found new Edition for " + key + "."); + Set uris = urisToReplace.remove(key); + if (uris != null) { + for (FreenetURI uri : uris) { + Page page = getRoot().getPageByURI(uri, false, ""); + if (page != null) { + page.setComment("Replaced by new edition " + key); + page.setStatus(Status.SUCCEEDED); + } + } + } + FreenetURI uri = key.getURI(); + queueURI(uri, "USK found edition", true); + startSomeRequests(); + editionsFound.getAndIncrement(); + } else { + Logger.minor(this, "Not Known Good. Edition search continues for " + key + "."); + } } @Override From 2a9fbd9a4a0df91eb0d419f979524e27faf37a22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 2021 15:19:06 +0200 Subject: [PATCH 09/42] Patch the already created database with new status This is an attempt that was not working. --- src/plugins/Spider/Spider.java | 4 ++++ src/plugins/Spider/db/PerstRoot.java | 8 ++++++++ 2 files changed, 12 insertions(+) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 962a1c6..7d1e541 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -905,6 +905,10 @@ private Storage initDB() { PerstRoot root = (PerstRoot) db.getRoot(); if (root == null) PerstRoot.createRoot(db); + else { + // Not working: + // PerstRoot.patchRoot(db); + } return db; } diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index 78bf448..06d09a1 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -141,4 +141,12 @@ public synchronized Config getConfig() { return config; } + public static void patchRoot(Storage storage) { + PerstRoot root = (PerstRoot) storage.getRoot(); + root.newPages = storage.createFieldIndex(Page.class, "lastChange", false); + + root.config = new Config(storage); + storage.setRoot(root); + } + } From e4cc3d2fc4996d4cffda6aee667f995fdb4b1231 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Thu, 17 Jun 
2021 18:51:10 +0200 Subject: [PATCH 10/42] Factor out the subscribing from within the lock --- src/plugins/Spider/Spider.java | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 7d1e541..dc75deb 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -242,6 +242,7 @@ private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { */ public void startSomeRequests() { ArrayList toStart = null; + List toSubscribe = new ArrayList(); synchronized (this) { if (stopped) return; @@ -322,9 +323,7 @@ public void startSomeRequests() { try { uri = new FreenetURI(page.getURI()); if (uri.isSSKForUSK()) { - USK usk = USK.create(uri.uskForSSK()); - if (urisToReplace.containsKey(usk)) continue; - subscribeUSK(usk.getURI(), uri); + toSubscribe.add(uri); page.setStatus(Status.INDEXED); started++; } @@ -346,6 +345,19 @@ public void startSomeRequests() { g.getClientCallback().onFailure(e, g); } } + + for (FreenetURI uri : toSubscribe) { + USK usk; + try { + usk = USK.create(uri.uskForSSK()); + } catch (MalformedURLException e1) { + continue; + } + if (urisToReplace.containsKey(usk)) { + continue; + } + subscribeUSK(usk.getURI(), uri); + } } /** From 0a95c6694deb6bc6ad8a57bd16c660fe6680d4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 18 Jun 2021 15:51:06 +0200 Subject: [PATCH 11/42] Count subscribed USKs and their replaced URIs separately --- src/plugins/Spider/Spider.java | 10 ++++++---- src/plugins/Spider/web/MainPage.java | 9 +++++---- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index dc75deb..4d180e7 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -119,6 +119,7 @@ public long getRealVersion() { private final AtomicLong lastRequestFinishedAt = new AtomicLong(); private final AtomicInteger 
subscribedToUSKs = new AtomicInteger(); + private final AtomicInteger replacedByUSKs = new AtomicInteger(); private final AtomicInteger editionsFound = new AtomicInteger(); private Map> urisToReplace = Collections.synchronizedMap(new HashMap>()); @@ -139,14 +140,14 @@ public long getLastRequestFinishedAt() { return lastRequestFinishedAt.get(); } - public int getSubscribedUSKs() { - return subscribedToUSKs.get() - editionsFound.get(); - } - public int getSubscribedToUSKs() { return subscribedToUSKs.get(); } + public int getReplacedByUSKs() { + return replacedByUSKs.get(); + } + public int getEditionsFound() { return editionsFound.get(); } @@ -228,6 +229,7 @@ private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { return; } Set uris = urisToReplace.get(usk); + replacedByUSKs.getAndIncrement(); if (uris == null) { subscribedToUSKs.getAndIncrement(); (clientContext.uskManager).subscribe(usk, this, false, this); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index 2be5d34..ccf2b68 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -109,10 +109,11 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { statusContent.addChild("br"); statusContent.addChild("#", "Queued Event: " + spider.callbackExecutor.getQueue().size()); statusContent.addChild("br"); - statusContent.addChild("#", "Subscribed USKs: " + spider.getSubscribedUSKs() + - " (subscribed: " + spider.getSubscribedToUSKs() + - " found " + spider.getEditionsFound() + - ")"); + statusContent.addChild("#", "Subscribed USKs: " + spider.getSubscribedToUSKs()); + statusContent.addChild("br"); + statusContent.addChild("#", "URIs replaced by the subscribed USKs: " + spider.getReplacedByUSKs()); + statusContent.addChild("br"); + statusContent.addChild("#", "Found editions: " + spider.getEditionsFound()); statusContent.addChild("br"); statusContent.addChild("#", "Library buffer size: 
"+spider.getLibraryBufferSize()); long lastRequestFinishedAt = spider.getLastRequestFinishedAt(); From 2d910f159a1b59cd2d9065e657d0e5a4181aff76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 18 Jun 2021 19:31:13 +0200 Subject: [PATCH 12/42] Change to trigger fetches regularly instead of on event --- src/plugins/Spider/Spider.java | 116 ++++++++++++--------------- src/plugins/Spider/web/MainPage.java | 1 - 2 files changed, 50 insertions(+), 67 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 4d180e7..bedcd60 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -23,9 +23,8 @@ import java.util.Locale; import java.util.Map; import java.util.Set; -import java.util.concurrent.PriorityBlockingQueue; +import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.ThreadFactory; -import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; @@ -240,21 +239,18 @@ private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { } /** - * Start requests from the queue if less than 80% of the max requests are running until the max requests are running. + * Start requests from new and queued. 
*/ - public void startSomeRequests() { + private void startFetches() { ArrayList toStart = null; List toSubscribe = new ArrayList(); synchronized (this) { if (stopped) return; - int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); - synchronized (runningFetch) { + int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); int running = runningFetch.size(); - if (running >= maxParallelRequests * 0.8) return; - // Prepare to start toStart = new ArrayList(maxParallelRequests - running); db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); @@ -310,6 +306,27 @@ public void startSomeRequests() { db.endThreadTransaction(); } } + } + + for (ClientGetter g : toStart) { + try { + g.start(clientContext); + Logger.minor(this, g + " started"); + } catch (FetchException e) { + g.getClientCallback().onFailure(e, g); + } + } + } + + /** + * Subscribe to USKs for indexed. + */ + private void startSubscribeUSKs() { + List toSubscribe = new ArrayList(); + synchronized (this) { + if (stopped) return; + + int maxParallelRequests = 2 * getRoot().getConfig().getMaxParallelRequests(); db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); getRoot().exclusiveLock(Status.INDEXED); @@ -318,9 +335,6 @@ public void startSomeRequests() { int started = 0; while (started < maxParallelRequests && it.hasNext()) { Page page = it.next(); -// if (page.getLastChange().after(new Date().) 
{ -// break; -// } FreenetURI uri; try { uri = new FreenetURI(page.getURI()); @@ -339,15 +353,6 @@ public void startSomeRequests() { } } - for (ClientGetter g : toStart) { - try { - g.start(clientContext); - Logger.minor(this, g + " started"); - } catch (FetchException e) { - g.getClientCallback().onFailure(e, g); - } - } - for (FreenetURI uri : toSubscribe) { USK usk; try { @@ -457,55 +462,17 @@ protected class SetConfigCallback implements Runnable { public void run() { synchronized (getRoot()) { getRoot().setConfig(config); - startSomeRequests(); - } - } - } - - protected class StartSomeRequestsCallback implements Runnable { - StartSomeRequestsCallback() { - } - - public void run() { - try { - Thread.sleep(30000); - } catch (InterruptedException e) { - // ignore - } - startSomeRequests(); - } - } - - protected static class CallbackPrioritizer implements Comparator { - public int compare(Runnable o1, Runnable o2) { - if (o1.getClass() == o2.getClass()) return 0; - - return getPriority(o1) - getPriority(o2); - } - - private int getPriority(Runnable r) { - if (r instanceof SetConfigCallback) { - return 0; - } else if (r instanceof OnFailureCallback) { - return 2; - } else if (r instanceof OnSuccessCallback) { - return 3; - } else if (r instanceof StartSomeRequestsCallback) { - return 4; } - - return -1; } } // this is java.util.concurrent.Executor, not freenet.support.Executor // always run with one thread --> more thread cause contention and slower! 
- public ThreadPoolExecutor callbackExecutor = new ThreadPoolExecutor( // - 1, 1, 600, TimeUnit.SECONDS, // - new PriorityBlockingQueue(5, new CallbackPrioritizer()), // + public ScheduledThreadPoolExecutor callbackExecutor = new ScheduledThreadPoolExecutor( + 1, new ThreadFactory() { public Thread newThread(Runnable r) { - Thread t = new NativeThread(r, "Spider", NativeThread.NORM_PRIORITY - 1, true); + Thread t = new NativeThread(r, "Spider", NativeThread.PriorityLevel.NORM_PRIORITY.value - 1, true); t.setDaemon(true); t.setContextClassLoader(Spider.this.getClass().getClassLoader()); return t; @@ -597,7 +564,6 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { synchronized (this) { runningFetch.remove(page); } - if (!stopped) startSomeRequests(); } finally { if (!dbTransactionEnded) { Logger.minor(this, "rollback transaction", new Exception("debug")); @@ -653,8 +619,6 @@ protected void onFailure(FetchException fe, ClientGetter getter, Page page) { db.rollbackThreadTransaction(); } } - - startSomeRequests(); } private boolean garbageCollecting = false; @@ -724,7 +688,28 @@ public synchronized void runPlugin(PluginRespirator pr) { librarybuffer = new LibraryBuffer(pr, this); librarybuffer.start(); - callbackExecutor.execute(new StartSomeRequestsCallback()); + callbackExecutor.scheduleWithFixedDelay(new Runnable() { + @Override + public void run() { + try { + startFetches(); + } catch (Throwable e) { + Logger.error(this, "startFetches throws", e); + } + } + + }, 30, 30, TimeUnit.SECONDS); + callbackExecutor.scheduleWithFixedDelay(new Runnable() { + @Override + public void run() { + try { + startSubscribeUSKs(); + } catch (Throwable e) { + Logger.error(this, "startSubscribeUSKs throws", e); + } + } + + }, 130, 60, TimeUnit.SECONDS); } private WebInterface webInterface; @@ -885,7 +870,6 @@ public void onFoundEdition(long l, USK key, ClientContext context, boolean metad } FreenetURI uri = key.getURI(); queueURI(uri, "USK found edition", 
true); - startSomeRequests(); editionsFound.getAndIncrement(); } else { Logger.minor(this, "Not Known Good. Edition search continues for " + key + "."); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index ccf2b68..ac1e937 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -65,7 +65,6 @@ public void processPostRequest(HTTPRequest request, HTMLNode contentNode) { addChild("#", e.getMessage()); Logger.normal(this, "Manual added URI cause exception", e); } - spider.startSomeRequests(); } } From fba4c9b68892cfbf44075a959b75f9df91694d8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 8 Aug 2021 15:45:13 +0200 Subject: [PATCH 13/42] Abort a fetch after 10 hours with a failure. --- src/plugins/Spider/Spider.java | 36 +++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index bedcd60..cf17c82 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -23,6 +23,7 @@ import java.util.Locale; import java.util.Map; import java.util.Set; +import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; @@ -83,6 +84,8 @@ public class Spider implements FredPlugin, FredPluginThreadless, /** Document ID of fetching documents */ protected Map runningFetch = Collections.synchronizedMap(new HashMap()); + private Map> runningFutures = Collections.synchronizedMap(new HashMap>()); + /** * Lists the allowed mime types of the fetched page. 
*/ @@ -308,13 +311,25 @@ private void startFetches() { } } - for (ClientGetter g : toStart) { + for (final ClientGetter g : toStart) { try { g.start(clientContext); Logger.minor(this, g + " started"); } catch (FetchException e) { g.getClientCallback().onFailure(e, g); + continue; } + ScheduledFuture future = callbackExecutor.scheduleWithFixedDelay(new Runnable() { + long lapsLeft = 10 * 60 * 60; + @Override + public void run() { + if (lapsLeft-- <= 0) { + g.cancel(clientContext); + Logger.minor(this, g + " aborted because of time-out"); + } + } + }, 10, 1, TimeUnit.SECONDS); + runningFutures.put(g, future); } } @@ -377,20 +392,31 @@ public ClientGetterCallback(Page page) { this.page = page; } - @Override + @Override public void onFailure(FetchException e, ClientGetter state) { + Logger.minor(this, "onFailure: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); + removeFuture(state); + if (stopped) return; callbackExecutor.execute(new OnFailureCallback(e, state, page)); - Logger.minor(this, "Queued OnFailure: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); } - @Override + @Override public void onSuccess(final FetchResult result, final ClientGetter state) { + Logger.minor(this, "onSuccess: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); + removeFuture(state); + if (stopped) return; callbackExecutor.execute(new OnSuccessCallback(result, state, page)); - Logger.minor(this, "Queued OnSuccess: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); + } + + private void removeFuture(ClientGetter getter) { + ScheduledFuture future = runningFutures.remove(getter); + if (future != null) { + future.cancel(false); + } } public String toString() { From 4991a42e5534bbf2f5585f411d4c706f4e5e117c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 9 Jan 2022 13:31:31 +0100 Subject: [PATCH 14/42] Improve logging --- src/plugins/Spider/Spider.java | 4 ++-- src/plugins/Spider/db/Page.java | 4 +++- 2 files changed, 
5 insertions(+), 3 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index cf17c82..46d3694 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -269,7 +269,7 @@ private void startFetches() { try { ClientGetter getter = makeGetter(page); - Logger.minor(this, "Starting " + getter + " " + page); + Logger.minor(this, "Starting new " + getter + " " + page); toStart.add(getter); runningFetch.put(page, getter); } catch (MalformedURLException e) { @@ -295,7 +295,7 @@ private void startFetches() { try { ClientGetter getter = makeGetter(page); - Logger.minor(this, "Starting " + getter + " " + page); + Logger.minor(this, "Starting queued " + getter + " " + page); toStart.add(getter); runningFetch.put(page, getter); } catch (MalformedURLException e) { diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 0bd4bbb..477c531 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -127,8 +127,10 @@ private void preModify() { Logger.error(this, "Page: Key not found in index: "+this, e); System.err.println("Page: Key not found in index: "+this); e.printStackTrace(); - } else + } else { + Logger.error(this, "remove from index " + status + " failed", e); throw e; + } } finally { coll.unlock(); } From e3506d32e64c8e09b60e01cab77a76cbb606cf99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 9 Jan 2022 14:16:15 +0100 Subject: [PATCH 15/42] Move the changes of the page out of the lock in startSubscribeUSK The change of the page affects the list of pages and might confuse the iterator. 
--- src/plugins/Spider/Spider.java | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 46d3694..8b5d874 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -22,6 +22,7 @@ import java.util.List; import java.util.Locale; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; @@ -337,14 +338,14 @@ public void run() { * Subscribe to USKs for indexed. */ private void startSubscribeUSKs() { - List toSubscribe = new ArrayList(); + Map toSubscribe = new HashMap(); synchronized (this) { if (stopped) return; int maxParallelRequests = 2 * getRoot().getConfig().getMaxParallelRequests(); db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - getRoot().exclusiveLock(Status.INDEXED); + getRoot().sharedLockPages(Status.INDEXED); try { Iterator it = getRoot().getPages(Status.INDEXED); int started = 0; @@ -354,12 +355,11 @@ private void startSubscribeUSKs() { try { uri = new FreenetURI(page.getURI()); if (uri.isSSKForUSK()) { - toSubscribe.add(uri); - page.setStatus(Status.INDEXED); + toSubscribe.put(uri, page); started++; } } catch (MalformedURLException e) { - // This could not be converted. + // This could not be converted - ignore. } } } finally { @@ -368,17 +368,23 @@ private void startSubscribeUSKs() { } } - for (FreenetURI uri : toSubscribe) { + for (Entry entry : toSubscribe.entrySet()) { + FreenetURI uri = entry.getKey(); + Page page = entry.getValue(); USK usk; try { usk = USK.create(uri.uskForSSK()); } catch (MalformedURLException e1) { + page.setComment("MalformedURL in SubscribeUSK"); continue; } if (urisToReplace.containsKey(usk)) { + // Everything is subscribed to. continue; } + subscribeUSK(usk.getURI(), uri); + page.setStatus(Status.INDEXED); // Move last. 
} } From ca0aece2288abf3d9e6329c9e7562eed3c65175c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Tue, 18 Jan 2022 17:45:23 +0100 Subject: [PATCH 16/42] Avoid searching for new jobs if there isn't any slots available --- src/plugins/Spider/Spider.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 8b5d874..db3c998 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -255,6 +255,10 @@ private void startFetches() { int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); int running = runningFetch.size(); + if (maxParallelRequests <= running) { + return; + } + // Prepare to start toStart = new ArrayList(maxParallelRequests - running); db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); From 8d34f0ced6a85dbf139f52ab52ca9d7ee6e22aa8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Tue, 18 Jan 2022 17:48:34 +0100 Subject: [PATCH 17/42] Cleaned code according to Eclipse suggestions --- src/plugins/Spider/Spider.java | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index db3c998..045d31f 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -15,7 +15,6 @@ import java.net.URI; import java.util.ArrayList; import java.util.Collections; -import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; @@ -247,7 +246,6 @@ private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { */ private void startFetches() { ArrayList toStart = null; - List toSubscribe = new ArrayList(); synchronized (this) { if (stopped) return; @@ -850,7 +848,7 @@ void finish() { } } - HashMap tpes = new HashMap(); + HashMap tpes = new HashMap(); /** * Add a word to the database for this page From 74045cccf584ff3a5965b8f0972b0346485f9e35 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 14 Jan 2022 17:46:35 +0100 Subject: [PATCH 18/42] Attempt to move entries to the right list Also added more logging. --- src/plugins/Spider/Spider.java | 69 +++++++++++++++++++++------- src/plugins/Spider/db/Page.java | 41 +++++++++++++++++ src/plugins/Spider/db/PerstRoot.java | 2 + 3 files changed, 95 insertions(+), 17 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 045d31f..ef0b37e 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -259,16 +259,23 @@ private void startFetches() { // Prepare to start toStart = new ArrayList(maxParallelRequests - running); - db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); - getRoot().sharedLockPages(Status.NEW); + Page pageInWrongList = null; + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); try { Iterator it = getRoot().getPages(Status.NEW); while (running + toStart.size() < maxParallelRequests && it.hasNext()) { Page page = it.next(); + Logger.debug(this, "Page " + page + " found in NEW."); // Skip if getting this page already if (runningFetch.containsKey(page)) continue; + final Status status = page.getStatus(); + if (status != Status.NEW) { + pageInWrongList = page; + continue; + } + try { ClientGetter getter = makeGetter(page); @@ -282,19 +289,28 @@ private void startFetches() { } } } finally { - getRoot().unlockPages(Status.NEW); + if (pageInWrongList != null) { + pageInWrongList.pageFoundInWrongList(); + } db.endThreadTransaction(); } - db.beginThreadTransaction(Storage.COOPERATIVE_TRANSACTION); - getRoot().sharedLockPages(Status.QUEUED); + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + pageInWrongList = null; try { Iterator it = getRoot().getPages(Status.QUEUED); while (running + toStart.size() < maxParallelRequests && it.hasNext()) { Page page = it.next(); + Logger.debug(this, "Page " + page + " found in QUEUED."); // Skip if getting this page already if 
(runningFetch.containsKey(page)) continue; + final Status status = page.getStatus(); + if (status != Status.QUEUED) { + pageInWrongList = page; + continue; + } + try { ClientGetter getter = makeGetter(page); @@ -308,7 +324,9 @@ private void startFetches() { } } } finally { - getRoot().unlockPages(Status.QUEUED); + if (pageInWrongList != null) { + pageInWrongList.pageFoundInWrongList(); + } db.endThreadTransaction(); } } @@ -347,12 +365,12 @@ private void startSubscribeUSKs() { int maxParallelRequests = 2 * getRoot().getConfig().getMaxParallelRequests(); db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - getRoot().sharedLockPages(Status.INDEXED); try { Iterator it = getRoot().getPages(Status.INDEXED); int started = 0; while (started < maxParallelRequests && it.hasNext()) { Page page = it.next(); + Logger.debug(this, "Page " + page + " found in INDEXED."); FreenetURI uri; try { uri = new FreenetURI(page.getURI()); @@ -365,7 +383,6 @@ private void startSubscribeUSKs() { } } } finally { - getRoot().unlockPages(Status.INDEXED); db.endThreadTransaction(); } } @@ -377,7 +394,12 @@ private void startSubscribeUSKs() { try { usk = USK.create(uri.uskForSSK()); } catch (MalformedURLException e1) { - page.setComment("MalformedURL in SubscribeUSK"); + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + try { + page.setComment("MalformedURL in SubscribeUSK"); + } finally { + db.endThreadTransaction(); + } continue; } if (urisToReplace.containsKey(usk)) { @@ -386,7 +408,13 @@ private void startSubscribeUSKs() { } subscribeUSK(usk.getURI(), uri); - page.setStatus(Status.INDEXED); // Move last. + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + try { + page.setStatus(Status.INDEXED); // Move last. + } finally { + db.endThreadTransaction(); + } + } } @@ -894,16 +922,21 @@ public void onFoundEdition(long l, USK key, ClientContext context, boolean metad Logger.minor(this, "Known Good. 
Found new Edition for " + key + "."); Set uris = urisToReplace.remove(key); if (uris != null) { - for (FreenetURI uri : uris) { - Page page = getRoot().getPageByURI(uri, false, ""); - if (page != null) { - page.setComment("Replaced by new edition " + key); - page.setStatus(Status.SUCCEEDED); + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + try { + for (FreenetURI uri : uris) { + Page page = getRoot().getPageByURI(uri, false, ""); + if (page != null) { + page.setComment("Replaced by new edition " + key); + page.setStatus(Status.SUCCEEDED); + } } + } finally { + db.endThreadTransaction(); } } FreenetURI uri = key.getURI(); - queueURI(uri, "USK found edition", true); + queueURI(uri, "USK found edition " + uri, true); editionsFound.getAndIncrement(); } else { Logger.minor(this, "Not Known Good. Edition search continues for " + key + "."); @@ -990,7 +1023,9 @@ public void resetPages(Status from, Status to) { int count = 0; Iterator pages = getRoot().getPages(from); while(pages.hasNext()) { - pages.next().setStatus(to); + Page page = pages.next(); + Logger.debug(this, "Page " + page + " found in " + from + "."); + page.setStatus(to); count++; } System.out.println("Reset "+count+" pages status from "+from+" to "+to); diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 477c531..473fbae 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -39,6 +39,7 @@ public Page() { } public synchronized void setStatus(Status status) { + Logger.debug(this, "New status " + status + " for " + this); preModify(); this.status = status; postModify(); @@ -49,6 +50,7 @@ public Status getStatus() { } public synchronized void setComment(String comment) { + Logger.debug(this, "New comment for " + this); preModify(); this.comment = comment; postModify(); @@ -67,6 +69,7 @@ public long getId() { } public void setPageTitle(String pageTitle) { + Logger.debug(this, "New page title for " + this); preModify(); this.pageTitle = 
pageTitle; postModify(); @@ -155,4 +158,42 @@ private void postModify() { } } } + + /** + * Called when we find the page in the wrong list. + * + * This should never happen but it has and is a major problem since it + * locks up the search. + */ + public void pageFoundInWrongList() { + Storage storage = getStorage(); + + if (storage != null) { + PerstRoot root = (PerstRoot) storage.getRoot(); + Logger.error(this, "Page " + this + " found in wrong list. Will remove from all lists and put back."); + for (Status status : Status.values()) { + FieldIndex coll = root.getPageIndex(status); + coll.exclusiveLock(); + try { + coll.remove(this); + Logger.minor(this, "Page " + this + " was removed from " + status); + } catch (StorageError e) { + if(e.getErrorCode() == StorageError.KEY_NOT_FOUND) { + // This is the normal case. + } else { + Logger.error(this, "Error in storage when removing " + this + " from " + status + ".", e); + } + } finally { + coll.unlock(); + } + } + FieldIndex coll = root.getPageIndex(status); + coll.exclusiveLock(); + try { + coll.put(this); + } finally { + coll.unlock(); + } + } + } } diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index 06d09a1..4ca5a18 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -9,6 +9,7 @@ import plugins.Spider.org.garret.perst.Persistent; import plugins.Spider.org.garret.perst.Storage; import freenet.keys.FreenetURI; +import freenet.support.Logger; public class PerstRoot extends Persistent { @@ -53,6 +54,7 @@ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { Page page = uriPage.get(new Key(uri.toString())); if (create && page == null) { + Logger.debug(this, "New page created for " + uri.toString()); page = new Page(uri.toString(), comment, getStorage()); idPage.append(page); From 1b4a53800631fa912af82dd59a79d73bd91bbf51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 14 Jan 2022 20:08:08 
+0100 Subject: [PATCH 19/42] Factor out the list of variables to index it on Status --- src/plugins/Spider/db/PerstRoot.java | 63 +++++++---------------- src/plugins/Spider/web/MainPage.java | 75 +++++----------------------- 2 files changed, 31 insertions(+), 107 deletions(-) diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index 4ca5a18..bcddc9b 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -1,10 +1,10 @@ package plugins.Spider.db; +import java.util.HashMap; import java.util.Iterator; -import java.util.List; +import java.util.Map; import plugins.Spider.org.garret.perst.FieldIndex; -import plugins.Spider.org.garret.perst.IterableIterator; import plugins.Spider.org.garret.perst.Key; import plugins.Spider.org.garret.perst.Persistent; import plugins.Spider.org.garret.perst.Storage; @@ -15,12 +15,7 @@ public class PerstRoot extends Persistent { protected FieldIndex idPage; protected FieldIndex uriPage; - protected FieldIndex newPages; - protected FieldIndex queuedPages; - protected FieldIndex failedPages; - protected FieldIndex succeededPages; - protected FieldIndex notPushedPages; - protected FieldIndex indexedPages; + Map> statusPages = new HashMap>(); private Config config; @@ -30,26 +25,27 @@ public PerstRoot() { public static PerstRoot createRoot(Storage storage) { PerstRoot root = new PerstRoot(); - root.idPage = storage.createFieldIndex(Page.class, "id", true); - root.uriPage = storage.createFieldIndex(Page.class, "uri", true); - root.newPages = storage.createFieldIndex(Page.class, "lastChange", false); - root.queuedPages = storage.createFieldIndex(Page.class, "lastChange", false); - root.failedPages = storage.createFieldIndex(Page.class, "lastChange", false); - root.succeededPages = storage.createFieldIndex(Page.class, "lastChange", false); - root.notPushedPages = storage.createFieldIndex(Page.class, "lastChange", false); - root.indexedPages = 
storage.createFieldIndex(Page.class, "lastChange", false); - - root.config = new Config(storage); + root.create(storage); storage.setRoot(root); return root; } + + private void create(Storage storage) { + idPage = storage.createFieldIndex(Page.class, "id", true); + uriPage = storage.createFieldIndex(Page.class, "uri", true); + for (Status status : Status.values()) { + statusPages.put(status, storage.createFieldIndex(Page.class, "lastChange", true)); + } + + config = new Config(storage); + } public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.exclusiveLock(); uriPage.exclusiveLock(); - newPages.exclusiveLock(); + statusPages.get(Status.NEW).exclusiveLock(); try { Page page = uriPage.get(new Key(uri.toString())); @@ -59,12 +55,12 @@ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.append(page); uriPage.put(page); - newPages.put(page); + statusPages.get(Status.NEW).put(page); } return page; } finally { - newPages.unlock(); + statusPages.get(Status.NEW).unlock(); uriPage.unlock(); idPage.unlock(); } @@ -81,22 +77,7 @@ public Page getPageById(long id) { } FieldIndex getPageIndex(Status status) { - switch (status) { - case FAILED: - return failedPages; - case NEW: - return newPages; - case QUEUED: - return queuedPages; - case SUCCEEDED: - return succeededPages; - case NOT_PUSHED: - return notPushedPages; - case INDEXED: - return indexedPages; - default: - return null; - } + return statusPages.get(status); } public void exclusiveLock(Status status) { @@ -143,12 +124,4 @@ public synchronized Config getConfig() { return config; } - public static void patchRoot(Storage storage) { - PerstRoot root = (PerstRoot) storage.getRoot(); - root.newPages = storage.createFieldIndex(Page.class, "lastChange", false); - - root.config = new Config(storage); - storage.setRoot(root); - } - } diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index ac1e937..5715d66 100644 --- 
a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -5,7 +5,6 @@ package plugins.Spider.web; import java.util.ArrayList; -import java.util.Date; import java.util.Iterator; import java.util.List; @@ -78,13 +77,6 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { HTMLNode overviewTable = contentNode.addChild("table", "class", "column"); HTMLNode overviewTableRow = overviewTable.addChild("tr"); - PageStatus newStatus = getPageStatus(Status.NEW); - PageStatus queuedStatus = getPageStatus(Status.QUEUED); - PageStatus indexedStatus = getPageStatus(Status.INDEXED); - PageStatus succeededStatus = getPageStatus(Status.SUCCEEDED); - PageStatus failedStatus = getPageStatus(Status.FAILED); - PageStatus notPushedStatus = getPageStatus(Status.NOT_PUSHED); - List runningFetch = spider.getRunningFetch(); Config config = spider.getConfig(); @@ -94,18 +86,10 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { statusContent.addChild("#", "Running Request: " + runningFetch.size() + "/" + config.getMaxParallelRequests()); statusContent.addChild("br"); - statusContent.addChild("#", "New: " + newStatus.count); - statusContent.addChild("br"); - statusContent.addChild("#", "Queued: " + queuedStatus.count); - statusContent.addChild("br"); - statusContent.addChild("#", "Indexed: " + indexedStatus.count); - statusContent.addChild("br"); - statusContent.addChild("#", "Succeeded: " + succeededStatus.count); - statusContent.addChild("br"); - statusContent.addChild("#", "Not pushed: " + notPushedStatus.count); - statusContent.addChild("br"); - statusContent.addChild("#", "Failed: " + failedStatus.count); - statusContent.addChild("br"); + for (Status status : Status.values()) { + statusContent.addChild("#", status + ": " + getPageStatus(status).count); + statusContent.addChild("br"); + } statusContent.addChild("#", "Queued Event: " + spider.callbackExecutor.getQueue().size()); statusContent.addChild("br"); 
statusContent.addChild("#", "Subscribed USKs: " + spider.getSubscribedToUSKs()); @@ -165,47 +149,14 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { } contentNode.addChild(runningBox); - InfoboxNode newd = pageMaker.getInfobox("New URI"); - HTMLNode newBox = newd.outer; - newBox.addAttribute("style", "right: 0; overflow: auto;"); - HTMLNode newContent = newd.content; - listPages(newStatus, newContent); - contentNode.addChild(newBox); - - InfoboxNode queued = pageMaker.getInfobox("Queued URI"); - HTMLNode queuedBox = queued.outer; - queuedBox.addAttribute("style", "right: 0; overflow: auto;"); - HTMLNode queuedContent = queued.content; - listPages(queuedStatus, queuedContent); - contentNode.addChild(queuedBox); - - InfoboxNode indexed = pageMaker.getInfobox("Indexed URI"); - HTMLNode indexedBox = indexed.outer; - indexedBox.addAttribute("style", "right: 0;"); - HTMLNode indexedContent = indexed.content; - listPages(indexedStatus, indexedContent); - contentNode.addChild(indexedBox); - - InfoboxNode succeeded = pageMaker.getInfobox("Succeeded URI"); - HTMLNode succeededBox = succeeded.outer; - succeededBox.addAttribute("style", "right: 0;"); - HTMLNode succeededContent = succeeded.content; - listPages(succeededStatus, succeededContent); - contentNode.addChild(succeededBox); - - InfoboxNode notPushed = pageMaker.getInfobox("Not pushed URI"); - HTMLNode notPushedBox = notPushed.outer; - notPushedBox.addAttribute("style", "right: 0;"); - HTMLNode notPushedContent = notPushed.content; - listPages(notPushedStatus, notPushedContent); - contentNode.addChild(notPushedBox); - - InfoboxNode failed = pageMaker.getInfobox("Failed URI"); - HTMLNode failedBox = failed.outer; - failedBox.addAttribute("style", "right: 0;"); - HTMLNode failedContent = failed.content; - listPages(failedStatus, failedContent); - contentNode.addChild(failedBox); + for (Status status : Status.values()) { + InfoboxNode d = pageMaker.getInfobox(status + " URIs"); + HTMLNode box = 
d.outer; + box.addAttribute("style", "right: 0; overflow: auto;"); + HTMLNode content = d.content; + listPages(getPageStatus(status), content); + contentNode.addChild(box); + } } //-- Utilities @@ -216,7 +167,7 @@ private PageStatus getPageStatus(Status status) { Iterator it = root.getPages(status); int showURI = spider.getConfig().getMaxShownURIs(); - List page = new ArrayList(); + List page = new ArrayList(); while (page.size() < showURI && it.hasNext()) { page.add(it.next()); } From 0b3d20e3b96339134fcdd154bc68a26a3724f314 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Wed, 19 Jan 2022 18:26:17 +0100 Subject: [PATCH 20/42] Partial work for restructuring of pages USKs treated as USKs, ... --- src/plugins/Spider/LibraryBuffer.java | 13 ++- src/plugins/Spider/Spider.java | 111 ++++++++++++++------------ src/plugins/Spider/db/Status.java | 22 +++-- src/plugins/Spider/web/MainPage.java | 2 +- 4 files changed, 83 insertions(+), 65 deletions(-) diff --git a/src/plugins/Spider/LibraryBuffer.java b/src/plugins/Spider/LibraryBuffer.java index 151450b..5e9b552 100644 --- a/src/plugins/Spider/LibraryBuffer.java +++ b/src/plugins/Spider/LibraryBuffer.java @@ -161,13 +161,12 @@ private void sendBuffer(int bufferUsageEstimated) { long tStart = System.currentTimeMillis(); try { Logger.normal(this, "Sending buffer of estimated size " + bufferUsageEstimated + " bytes to Library"); - long totalPagesIndexed = spider.getRoot().getPageCount(Status.INDEXED); Bucket bucket = pr.getNode().clientCore.tempBucketFactory.makeBucket(3000000); - writeToPush(totalPagesIndexed, bucket); + writeToPush(totalPagesIndexed(), bucket); innerSend(bucket); Logger.normal(this, "Buffer successfully sent to Library, size = "+bucket.size()); // Not a separate transaction, commit with the index updates. 
- spider.resetPages(Status.NOT_PUSHED, Status.INDEXED); + spider.donePages(); } catch (IOException ex) { Logger.error(this, "Could not make bucket to transfer buffer", ex); } @@ -186,6 +185,12 @@ private void sendBuffer(int bufferUsageEstimated) { System.out.println("Restored data from last time from "+SAVE_FILE); } } + + private long totalPagesIndexed() { + return spider.getRoot().getPageCount(Status.DONE) + + spider.getRoot().getPageCount(Status.PROCESSED_KSK) + + spider.getRoot().getPageCount(Status.PROCESSED_USK); + } private synchronized Bucket writeToPush(long totalPagesIndexed, Bucket bucket) throws IOException { OutputStream os = bucket.getOutputStream(); @@ -271,7 +276,7 @@ public void terminate() { FileBucket bucket = new FileBucket(SAVE_FILE, false, false, false, false); long totalPagesIndexed; try { - totalPagesIndexed = spider.getRoot().getPageCount(Status.INDEXED); + totalPagesIndexed = totalPagesIndexed(); } catch (Throwable t) { totalPagesIndexed = -1; // FIXME I don't understand why this (ClassNotFoundException) happens, we have not closed the class loader yet. diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index ef0b37e..70b854f 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -91,7 +91,7 @@ public class Spider implements FredPlugin, FredPluginThreadless, */ protected Set allowedMIMETypes; - static int dbVersion = 46; + static int dbVersion = 47; static int version = 53; /** We use the standard http://127.0.0.1:8888/ for parsing HTML regardless of what the local @@ -169,15 +169,15 @@ public void setConfig(Config config) { } /** - * Adds the found uri to the list of to-be-retrieved uris.

Every usk uri added as ssk. + * Adds the found uri to the list of to-be-retrieved uris.

* @param uri the new uri that needs to be fetched for further indexing */ - public void queueURI(FreenetURI uri, String comment, boolean force) { + public void queueURI(FreenetURI uri, String comment) { String sURI = uri.toString(); String lowerCaseURI = sURI.toLowerCase(Locale.US); for (String ext : getRoot().getConfig().getBadlistedExtensions()) { if (lowerCaseURI.endsWith(ext)) { - return; // be smart + return; // be smart, don't fetch certain files } } @@ -187,29 +187,10 @@ public void queueURI(FreenetURI uri, String comment, boolean force) { } } - if (uri.isUSK()) { - FreenetURI usk = uri; - try { - if (uri.getSuggestedEdition() < 0) { - uri = uri.setSuggestedEdition((-1) * uri.getSuggestedEdition()); - } - uri = ((USK.create(uri)).getSSK()).getURI(); - } catch (MalformedURLException e) { - } - subscribeUSK(usk, uri); - } - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); boolean dbTransactionEnded = false; try { - Page page = getRoot().getPageByURI(uri, true, comment); - if (force && page.getStatus() != Status.QUEUED) { - page.setComment(comment); - if (page.getStatus() != Status.NEW) { - page.setStatus(Status.QUEUED); - } - } - + getRoot().getPageByURI(uri, true, comment); db.endThreadTransaction(); dbTransactionEnded = true; } catch (RuntimeException e) { @@ -284,7 +265,7 @@ private void startFetches() { runningFetch.put(page, getter); } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); - page.setStatus(Status.FAILED); + page.setStatus(Status.FATALLY_FAILED); page.setComment("MalformedURLException"); } } @@ -297,16 +278,16 @@ private void startFetches() { db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); pageInWrongList = null; try { - Iterator it = getRoot().getPages(Status.QUEUED); + Iterator it = getRoot().getPages(Status.FAILED); while (running + toStart.size() < maxParallelRequests && it.hasNext()) { Page page = it.next(); - Logger.debug(this, "Page " + page + " found in QUEUED."); + 
Logger.debug(this, "Page " + page + " found in FAILED."); // Skip if getting this page already if (runningFetch.containsKey(page)) continue; final Status status = page.getStatus(); - if (status != Status.QUEUED) { + if (status != Status.FAILED) { pageInWrongList = page; continue; } @@ -319,7 +300,7 @@ private void startFetches() { runningFetch.put(page, getter); } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); - page.setStatus(Status.FAILED); + page.setStatus(Status.FATALLY_FAILED); page.setComment("MalformedURLException"); } } @@ -366,20 +347,17 @@ private void startSubscribeUSKs() { db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); try { - Iterator it = getRoot().getPages(Status.INDEXED); + Iterator it = getRoot().getPages(Status.PROCESSED_USK); int started = 0; while (started < maxParallelRequests && it.hasNext()) { Page page = it.next(); - Logger.debug(this, "Page " + page + " found in INDEXED."); - FreenetURI uri; + Logger.debug(this, "Page " + page + " found in PROCESSED_USK."); try { - uri = new FreenetURI(page.getURI()); - if (uri.isSSKForUSK()) { - toSubscribe.put(uri, page); - started++; - } + toSubscribe.put(new FreenetURI(page.getURI()), page); + started++; } catch (MalformedURLException e) { // This could not be converted - ignore. + page.setStatus(Status.FATALLY_FAILED); } } } finally { @@ -410,7 +388,7 @@ private void startSubscribeUSKs() { subscribeUSK(usk.getURI(), uri); db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); try { - page.setStatus(Status.INDEXED); // Move last. + page.setStatus(Status.DONE); // Move last. 
} finally { db.endThreadTransaction(); } @@ -593,7 +571,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } catch (UnsafeContentTypeException e) { // wrong mime type - page.setStatus(Status.SUCCEEDED); + page.setStatus(Status.PROCESSED_USK); page.setComment("UnsafeContentTypeException"); db.endThreadTransaction(); dbTransactionEnded = true; @@ -634,7 +612,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { // page is now invalidated. page = getRoot().getPageByURI(uri, false, ""); if(page != null) { - page.setStatus(Status.FAILED); + page.setStatus(Status.FATALLY_FAILED); page.setComment("could not complete operation dbTransaction not ended"); } db.endThreadTransaction(); @@ -657,16 +635,16 @@ protected void onFailure(FetchException fe, ClientGetter getter, Page page) { synchronized (page) { if (fe.newURI != null) { // redirect, mark as succeeded - queueURI(fe.newURI, "redirect from " + getter.getURI(), false); - page.setStatus(Status.SUCCEEDED); + queueURI(fe.newURI, "redirect from " + getter.getURI()); + page.setStatus(Status.PROCESSED_USK); page.setComment("Redirected"); } else if (fe.isFatal()) { // too many tries or fatal, mark as failed - page.setStatus(Status.FAILED); + page.setStatus(Status.FATALLY_FAILED); page.setComment("Fatal"); } else { // requeue at back - page.setStatus(Status.QUEUED); + page.setStatus(Status.FAILED); } } db.endThreadTransaction(); @@ -744,7 +722,7 @@ public synchronized void runPlugin(PluginRespirator pr) { FreenetURI[] initialURIs = core.getBookmarkURIs(); for (int i = 0; i < initialURIs.length; i++) { - queueURI(initialURIs[i], "bookmark", false); + queueURI(initialURIs[i], "bookmark"); } librarybuffer = new LibraryBuffer(pr, this); @@ -814,7 +792,7 @@ public void foundURI(FreenetURI uri) { public void foundURI(FreenetURI uri, boolean inline) { if (stopped) throw new RuntimeException("plugin stopping"); if (logDEBUG) Logger.debug(this, "foundURI " + uri + " on " + 
page); - queueURI(uri, "Added from " + page.getURI(), false); + queueURI(uri, "Added from " + page.getURI()); } protected Integer lastPosition = null; @@ -928,7 +906,7 @@ public void onFoundEdition(long l, USK key, ClientContext context, boolean metad Page page = getRoot().getPageByURI(uri, false, ""); if (page != null) { page.setComment("Replaced by new edition " + key); - page.setStatus(Status.SUCCEEDED); + page.setStatus(Status.PROCESSED_USK); } } } finally { @@ -936,7 +914,7 @@ public void onFoundEdition(long l, USK key, ClientContext context, boolean metad } } FreenetURI uri = key.getURI(); - queueURI(uri, "USK found edition " + uri, true); + queueURI(uri, "USK found edition " + uri); editionsFound.getAndIncrement(); } else { Logger.minor(this, "Not Known Good. Edition search continues for " + key + "."); @@ -970,10 +948,6 @@ private Storage initDB() { PerstRoot root = (PerstRoot) db.getRoot(); if (root == null) PerstRoot.createRoot(db); - else { - // Not working: - // PerstRoot.patchRoot(db); - } return db; } @@ -1031,6 +1005,39 @@ public void resetPages(Status from, Status to) { System.out.println("Reset "+count+" pages status from "+from+" to "+to); } + public void donePages() { + // Not a separate transaction, commit with the index updates. 
+ Status from = Status.NOT_PUSHED; + int count = 0; + Iterator pages = getRoot().getPages(from); + while(pages.hasNext()) { + Page page = pages.next(); + Status to; + FreenetURI uri; + try { + uri = new FreenetURI(page.getURI()); + if (uri.isCHK()) { + to = Status.DONE; + } else if (uri.isKSK()) { + to = Status.PROCESSED_KSK; + } else if (uri.isSSK()) { + to = Status.DONE; + } else if (uri.isUSK()) { + to = Status.PROCESSED_USK; + } else { + Logger.error(this, "Cannot understand the type of the key " + uri); + to = Status.DONE; + } + } catch (MalformedURLException e) { + to = Status.DONE; + } + Logger.debug(this, "Page " + page + " found in " + from + "."); + page.setStatus(to); + count++; + } + Logger.minor(this, "Considered " + count + " pages processed."); + } + public boolean realTimeFlag() { return false; // We definitely want throughput here. } diff --git a/src/plugins/Spider/db/Status.java b/src/plugins/Spider/db/Status.java index f2bc1d2..9c63e03 100644 --- a/src/plugins/Spider/db/Status.java +++ b/src/plugins/Spider/db/Status.java @@ -3,15 +3,21 @@ */ package plugins.Spider.db; +/** + * This enum also control the layout of the database so + * when changing this, be sure to update the dbVersion in + * Spider. + */ public enum Status { - NEW, // Newly found URIs, i.e. queued but never fetched. This puts them priority-wise before QUEUED. - /** For simplicity, running is also mark as QUEUED. + NEW, // Newly found URIs, i.e. never fetched. + NOT_PUSHED, + /** * NOT_PUSHED, when LibraryBuffer is enabled, means we have successfully fetched the page but have not - * yet uploaded the indexed data, so if we have an unclean shutdown we transfer all NOT_PUSHED to QUEUED + * yet uploaded the indexed data, so if we have an unclean shutdown we transfer all NOT_PUSHED to NEW * so they get re-run. */ - QUEUED, - INDEXED, // The information is sent to library. - SUCCEEDED, // The fetch "succeeded" but we will ignore or not include the result. 
Also when replaced with a new edition. - FAILED, // The fetch "failed" fatally and we will ignore the result. - NOT_PUSHED + DONE, // The information is sent to library or there was no result. There is no more work to do. + PROCESSED_KSK, // The KSK has been sent to the library. We will rescan this later. + PROCESSED_USK, // The USK has been sent to the library. We will rescan this later. + FAILED, + FATALLY_FAILED, // The fetch "failed" fatally and we will ignore the result and never try again. } \ No newline at end of file diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index 5715d66..b895b17 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -55,7 +55,7 @@ public void processPostRequest(HTTPRequest request, HTMLNode contentNode) { if (addURI != null && addURI.length() != 0) { try { FreenetURI uri = new FreenetURI(addURI); - spider.queueURI(uri, "manually", true); + spider.queueURI(uri, "manually"); pageMaker.getInfobox("infobox infobox-success", "URI Added", contentNode). 
addChild("#", "Added " + uri); From d4b6de14625524c1c82ecd6e35b9392a8f0b51fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 22 Jan 2022 13:37:52 +0100 Subject: [PATCH 21/42] Fix Eclipse warnings --- src/plugins/Spider/Spider.java | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 70b854f..4fd54e6 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -64,6 +64,7 @@ import freenet.pluginmanager.FredPluginVersioned; import freenet.pluginmanager.PluginRespirator; import freenet.support.Logger; +import freenet.support.Logger.LogLevel; import freenet.support.api.Bucket; import freenet.support.io.Closer; import freenet.support.io.NativeThread; @@ -766,7 +767,7 @@ public class PageCallBack implements FoundURICallback{ private String title; private int totalWords; - protected final boolean logDEBUG = Logger.shouldLog(Logger.DEBUG, this); // per instance, allow changing on the fly + protected final boolean logDEBUG = Logger.shouldLog(LogLevel.DEBUG, this); // per instance, allow changing on the fly PageCallBack(Page page) { this.page = page; @@ -960,8 +961,6 @@ protected Page getPageById(long id) { return getRoot().getPageById(id); } - // language for I10N - private LANGUAGE language; @Override public String getString(String key) { @@ -971,7 +970,7 @@ public String getString(String key) { @Override public void setLanguage(LANGUAGE newLanguage) { - language = newLanguage; + Logger.debug(this, "New language set " + newLanguage + " - ignored."); } public PageMaker getPageMaker() { From 9197ce7fad7d002e8027876d4091862d31634f58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 23 Jan 2022 11:54:58 +0100 Subject: [PATCH 22/42] Fix so that the Status-generated page lists can be saved --- src/plugins/Spider/db/PerstRoot.java | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git 
a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index bcddc9b..1a54ce1 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -1,8 +1,6 @@ package plugins.Spider.db; -import java.util.HashMap; import java.util.Iterator; -import java.util.Map; import plugins.Spider.org.garret.perst.FieldIndex; import plugins.Spider.org.garret.perst.Key; @@ -15,7 +13,7 @@ public class PerstRoot extends Persistent { protected FieldIndex idPage; protected FieldIndex uriPage; - Map> statusPages = new HashMap>(); + FieldIndex[] statusPages; private Config config; @@ -35,8 +33,9 @@ public static PerstRoot createRoot(Storage storage) { private void create(Storage storage) { idPage = storage.createFieldIndex(Page.class, "id", true); uriPage = storage.createFieldIndex(Page.class, "uri", true); + statusPages = new FieldIndex[Status.values().length]; for (Status status : Status.values()) { - statusPages.put(status, storage.createFieldIndex(Page.class, "lastChange", true)); + statusPages[status.ordinal()] = storage.createFieldIndex(Page.class, "lastChange", true); } config = new Config(storage); @@ -45,7 +44,7 @@ private void create(Storage storage) { public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.exclusiveLock(); uriPage.exclusiveLock(); - statusPages.get(Status.NEW).exclusiveLock(); + getPageIndex(Status.NEW).exclusiveLock(); try { Page page = uriPage.get(new Key(uri.toString())); @@ -55,12 +54,12 @@ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.append(page); uriPage.put(page); - statusPages.get(Status.NEW).put(page); + getPageIndex(Status.NEW).put(page); } return page; } finally { - statusPages.get(Status.NEW).unlock(); + getPageIndex(Status.NEW).unlock(); uriPage.unlock(); idPage.unlock(); } @@ -77,7 +76,7 @@ public Page getPageById(long id) { } FieldIndex getPageIndex(Status status) { - return statusPages.get(status); + return 
statusPages[status.ordinal()]; } public void exclusiveLock(Status status) { From f70c387b75f80551368858936d8ac1dcfb4a3696 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 23 Jan 2022 11:56:41 +0100 Subject: [PATCH 23/42] Never put USKs with search-suggested-edition (-) in the database Also allow 0 library buffer size. --- src/plugins/Spider/Spider.java | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 4fd54e6..7284b2b 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -188,6 +188,11 @@ public void queueURI(FreenetURI uri, String comment) { } } + // Always add an USK page without the '-' to trigger search of versions. + if (uri.isUSK() && uri.getSuggestedEdition() < 0) { + uri = uri.setSuggestedEdition(-uri.getSuggestedEdition()); + } + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); boolean dbTransactionEnded = false; try { @@ -542,7 +547,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { boolean dbTransactionEnded = false; db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); try { - librarybuffer.setBufferSize(getConfig().getNewFormatIndexBufferLimit()*1024*1024); + librarybuffer.setBufferSize(1 + getConfig().getNewFormatIndexBufferLimit()*1024*1024); /* * instead of passing the current object, the pagecallback object for every page is * passed to the content filter this has many benefits to efficiency, and allows us to From 284c124fbc5ce01c2f331fbcdc88b14542169b93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 23 Jan 2022 12:48:16 +0100 Subject: [PATCH 24/42] Add a new way of persisting pages allowing for USKs --- src/plugins/Spider/index/TermEntry.java | 3 ++- src/plugins/Spider/index/TermEntryWriter.java | 13 ++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/src/plugins/Spider/index/TermEntry.java 
b/src/plugins/Spider/index/TermEntry.java index 24fcbb7..ce6f9a4 100644 --- a/src/plugins/Spider/index/TermEntry.java +++ b/src/plugins/Spider/index/TermEntry.java @@ -18,7 +18,8 @@ */ abstract public class TermEntry implements Comparable { - final static long serialVersionUID = 0xF23194B7F015560CL; + // final static long serialVersionUID = 0xF23194B7F015560CL; + final static long serialVersionUID2 = 0xF33194B7F015560CL; public enum EntryType { INDEX, TERM, PAGE diff --git a/src/plugins/Spider/index/TermEntryWriter.java b/src/plugins/Spider/index/TermEntryWriter.java index f3064c4..d5ff7c4 100644 --- a/src/plugins/Spider/index/TermEntryWriter.java +++ b/src/plugins/Spider/index/TermEntryWriter.java @@ -31,7 +31,7 @@ public static TermEntryWriter getInstance() { } public void writeObject(TermEntry en, DataOutputStream dos) throws IOException { - dos.writeLong(TermEntry.serialVersionUID); + dos.writeLong(TermEntry.serialVersionUID2); TermEntry.EntryType type = en.entryType(); dos.writeInt(type.ordinal()); dos.writeUTF(en.subj); @@ -39,14 +39,15 @@ public void writeObject(TermEntry en, DataOutputStream dos) throws IOException { switch (type) { case PAGE: TermPageEntry enn = (TermPageEntry)en; - enn.page.writeFullBinaryKeyWithLength(dos); - int size = enn.hasPositions() ? enn.positionsSize() : 0; + dos.writeUTF(enn.page.toString()); if(enn.title == null) - dos.writeInt(size); + dos.writeBoolean(false); else { - dos.writeInt(~size); // invert bits to signify title is set + dos.writeBoolean(true); dos.writeUTF(enn.title); } + int size = enn.hasPositions() ? 
enn.positionsSize() : 0; + dos.writeInt(size); if(size != 0) { if(enn.hasFragments()) { for(Map.Entry p : enn.positionsMap().entrySet()) { @@ -64,6 +65,8 @@ public void writeObject(TermEntry en, DataOutputStream dos) throws IOException { } } return; + default: + throw new RuntimeException("Not implemented"); } } From c174e566b3ad8160d72066408b8d26873f1bb7db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 23 Jan 2022 16:15:21 +0100 Subject: [PATCH 25/42] Add debug logging on PageCallBack --- src/plugins/Spider/Spider.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 7284b2b..00f49a0 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -778,6 +778,7 @@ public class PageCallBack implements FoundURICallback{ this.page = page; try { this.uri = new FreenetURI(page.getURI()); + Logger.debug(this, "New PageCallBack for " + this.page + " (" + this.uri + ")."); } catch (MalformedURLException ex) { Logger.error(this, "Error creating uri from '"+page.getURI()+"'", ex); } @@ -858,6 +859,7 @@ void finish() { // Which is equal to log ( total count of files ) - log ( count of files with this word in ) librarybuffer.setRelevance(termPageEntry, ((float)termPageEntry.positionsSize()) / ((float)totalWords)); } + Logger.debug(this, "Finished PageCallBack for " + this.page + " (" + this.uri + ")."); } HashMap tpes = new HashMap(); From 417ef9715ce321e2296271d897639333a5fdbc8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 13 Feb 2022 16:10:48 +0100 Subject: [PATCH 26/42] Add searching for USKs --- .classpath | 2 +- src/plugins/Spider/Spider.java | 205 ++++++++++---------- src/plugins/Spider/db/Page.java | 2 +- src/plugins/Spider/db/Status.java | 2 +- src/plugins/Spider/index/TermPageEntry.java | 8 +- src/plugins/Spider/web/MainPage.java | 2 +- 6 files changed, 111 insertions(+), 110 deletions(-) diff --git a/.classpath 
b/.classpath index e79d94c..83bf4ad 100644 --- a/.classpath +++ b/.classpath @@ -4,7 +4,7 @@ - + diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 00f49a0..2cfa79d 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -21,7 +21,6 @@ import java.util.List; import java.util.Locale; import java.util.Map; -import java.util.Map.Entry; import java.util.Set; import java.util.concurrent.ScheduledFuture; import java.util.concurrent.ScheduledThreadPoolExecutor; @@ -80,7 +79,7 @@ * */ public class Spider implements FredPlugin, FredPluginThreadless, - FredPluginVersioned, FredPluginRealVersioned, FredPluginL10n, USKCallback, RequestClient { + FredPluginVersioned, FredPluginRealVersioned, FredPluginL10n, RequestClient { /** Document ID of fetching documents */ protected Map runningFetch = Collections.synchronizedMap(new HashMap()); @@ -121,11 +120,10 @@ public long getRealVersion() { private LibraryBuffer librarybuffer; private final AtomicLong lastRequestFinishedAt = new AtomicLong(); - private final AtomicInteger subscribedToUSKs = new AtomicInteger(); - private final AtomicInteger replacedByUSKs = new AtomicInteger(); + private final AtomicInteger newUSKs = new AtomicInteger(); private final AtomicInteger editionsFound = new AtomicInteger(); - private Map> urisToReplace = Collections.synchronizedMap(new HashMap>()); + private final Set subscribedToUSKs = new HashSet(); public int getLibraryBufferSize() { return librarybuffer.bufferUsageEstimate(); @@ -144,11 +142,11 @@ public long getLastRequestFinishedAt() { } public int getSubscribedToUSKs() { - return subscribedToUSKs.get(); + return subscribedToUSKs.size(); } - public int getReplacedByUSKs() { - return replacedByUSKs.get(); + public int getNewUSKs() { + return newUSKs.get(); } public int getEditionsFound() { @@ -171,6 +169,11 @@ public void setConfig(Config config) { /** * Adds the found uri to the list of to-be-retrieved uris.

+ * + * SSKs are added as their corresponding USK. + * + * Uris already in the database are not added. + * * @param uri the new uri that needs to be fetched for further indexing */ public void queueURI(FreenetURI uri, String comment) { @@ -193,6 +196,11 @@ public void queueURI(FreenetURI uri, String comment) { uri = uri.setSuggestedEdition(-uri.getSuggestedEdition()); } + // Never add an SSK if there could be a corresponding USK + if (uri.isSSKForUSK()) { + uri = uri.uskForSSK(); + } + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); boolean dbTransactionEnded = false; try { @@ -210,22 +218,49 @@ public void queueURI(FreenetURI uri, String comment) { } } - private void subscribeUSK(FreenetURI uri, FreenetURI uriToReplace) { + private class SubscribedToUSK implements USKCallback { + private FreenetURI uri; USK usk; - try { - usk = USK.create(uri); - } catch (MalformedURLException e) { - return; + + SubscribedToUSK(FreenetURI theURI) { + uri = theURI; + try { + usk = USK.create(uri); + } catch (MalformedURLException e) { + return; + } + (clientContext.uskManager).subscribe(usk, this, false, Spider.this); + } + + @Override + public void onFoundEdition(long l, USK key, ClientContext context, boolean metadata, + short codec, byte[] data, boolean newKnownGood, boolean newSlot) { + Logger.minor(this, "Found new Edition for " + key + ", newKnownGood=" + newKnownGood + " newSlot=" + newSlot + "."); + newUSKs.getAndIncrement(); + subscribedToUSKs.remove(this); + FreenetURI uri = key.getURI(); + + queueURI(uri, "USK found edition " + uri); } - Set uris = urisToReplace.get(usk); - replacedByUSKs.getAndIncrement(); - if (uris == null) { - subscribedToUSKs.getAndIncrement(); - (clientContext.uskManager).subscribe(usk, this, false, this); - uris = new HashSet(); + + public void unsubscribe() { + (clientContext.uskManager).unsubscribe(usk, this); + subscribedToUSKs.remove(this); } - uris.add(uriToReplace); - urisToReplace.put(usk, uris); + + @Override + public short 
getPollingPriorityNormal() { + return (short) Math.min(RequestStarter.MINIMUM_FETCHABLE_PRIORITY_CLASS, getRoot().getConfig().getRequestPriority() + 1); + } + + @Override + public short getPollingPriorityProgress() { + return getRoot().getConfig().getRequestPriority(); + } + } + + private void subscribeUSK(FreenetURI uri) { + subscribedToUSKs.add(new SubscribedToUSK(uri)); } /** @@ -328,12 +363,14 @@ private void startFetches() { continue; } ScheduledFuture future = callbackExecutor.scheduleWithFixedDelay(new Runnable() { - long lapsLeft = 10 * 60 * 60; + long lapsLeft = 10 * 60 * 60; // Ten hours @Override public void run() { if (lapsLeft-- <= 0) { g.cancel(clientContext); Logger.minor(this, g + " aborted because of time-out"); + ScheduledFuture f = runningFutures.get(g); + f.cancel(false); } } }, 10, 1, TimeUnit.SECONDS); @@ -342,27 +379,31 @@ public void run() { } /** - * Subscribe to USKs for indexed. + * Subscribe to USKs for PROCESSED_USKs. */ - private void startSubscribeUSKs() { - Map toSubscribe = new HashMap(); + private void subscribeAllUSKs() { synchronized (this) { if (stopped) return; - int maxParallelRequests = 2 * getRoot().getConfig().getMaxParallelRequests(); - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); try { Iterator it = getRoot().getPages(Status.PROCESSED_USK); - int started = 0; - while (started < maxParallelRequests && it.hasNext()) { + while (it.hasNext()) { Page page = it.next(); Logger.debug(this, "Page " + page + " found in PROCESSED_USK."); + FreenetURI uri; try { - toSubscribe.put(new FreenetURI(page.getURI()), page); - started++; + uri = new FreenetURI(page.getURI()); } catch (MalformedURLException e) { // This could not be converted - ignore. 
+ Logger.error(this, "USK could not be converted to uri " + page); + page.setStatus(Status.FATALLY_FAILED); + continue; + } + if (uri.isUSK()) { + subscribeUSK(uri); + } else { + Logger.error(this, "USK was not USK " + page); page.setStatus(Status.FATALLY_FAILED); } } @@ -370,36 +411,6 @@ private void startSubscribeUSKs() { db.endThreadTransaction(); } } - - for (Entry entry : toSubscribe.entrySet()) { - FreenetURI uri = entry.getKey(); - Page page = entry.getValue(); - USK usk; - try { - usk = USK.create(uri.uskForSSK()); - } catch (MalformedURLException e1) { - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - try { - page.setComment("MalformedURL in SubscribeUSK"); - } finally { - db.endThreadTransaction(); - } - continue; - } - if (urisToReplace.containsKey(usk)) { - // Everything is subscribed to. - continue; - } - - subscribeUSK(usk.getURI(), uri); - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - try { - page.setStatus(Status.DONE); // Move last. - } finally { - db.endThreadTransaction(); - } - - } } /** @@ -573,11 +584,12 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } } pageCallBack.finish(); + page.setStatus(Status.NOT_PUSHED); librarybuffer.maybeSend(); } catch (UnsafeContentTypeException e) { // wrong mime type - page.setStatus(Status.PROCESSED_USK); + page.setStatus(Status.FATALLY_FAILED); page.setComment("UnsafeContentTypeException"); db.endThreadTransaction(); dbTransactionEnded = true; @@ -594,7 +606,6 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { return; } - page.setStatus(Status.NOT_PUSHED); db.endThreadTransaction(); dbTransactionEnded = true; @@ -640,14 +651,30 @@ protected void onFailure(FetchException fe, ClientGetter getter, Page page) { try { synchronized (page) { if (fe.newURI != null) { + // Cases are noticed when the USK is redirected, + // because of the missing meta-string, to its SSK. 
+ // That is not good from the purpose of maintaining the USK + // in the index. + FreenetURI newURI = fe.newURI; + if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { + try { + FreenetURI uri; + uri = new FreenetURI(page.getURI()); + if (uri.isUSK() && !uri.hasMetaStrings()) { + newURI = uri.pushMetaString(""); + } + } catch (MalformedURLException e) { + // Ignore problems in the URI of the page. + } + } // redirect, mark as succeeded - queueURI(fe.newURI, "redirect from " + getter.getURI()); - page.setStatus(Status.PROCESSED_USK); - page.setComment("Redirected"); + queueURI(newURI, "redirect from " + getter.getURI()); + page.setStatus(Status.DONE); + page.setComment("Redirected to " + newURI + " because of " + fe.getMode()); } else if (fe.isFatal()) { // too many tries or fatal, mark as failed page.setStatus(Status.FATALLY_FAILED); - page.setComment("Fatal"); + page.setComment("Fatal: " + fe.getMode()); } else { // requeue at back page.setStatus(Status.FAILED); @@ -683,6 +710,9 @@ public void terminate(){ Logger.minor(this, "Canceling request" + getter); getter.cancel(clientContext); } + for (SubscribedToUSK stu : new HashSet(subscribedToUSKs)) { + stu.unsubscribe(); + } runningFetch.clear(); callbackExecutor.shutdownNow(); } @@ -745,17 +775,17 @@ public void run() { } }, 30, 30, TimeUnit.SECONDS); - callbackExecutor.scheduleWithFixedDelay(new Runnable() { + callbackExecutor.schedule(new Runnable() { @Override public void run() { try { - startSubscribeUSKs(); + subscribeAllUSKs(); } catch (Throwable e) { Logger.error(this, "startSubscribeUSKs throws", e); } } - }, 130, 60, TimeUnit.SECONDS); + }, 10L, TimeUnit.SECONDS); } private WebInterface webInterface; @@ -901,40 +931,6 @@ public void onFinishedPage() { } } - @Override - public void onFoundEdition(long l, USK key, ClientContext context, boolean metadata, - short codec, byte[] data, boolean newKnownGood, boolean newSlotToo) { - if (newKnownGood) { - Logger.minor(this, "Known Good. 
Found new Edition for " + key + "."); - Set uris = urisToReplace.remove(key); - if (uris != null) { - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - try { - for (FreenetURI uri : uris) { - Page page = getRoot().getPageByURI(uri, false, ""); - if (page != null) { - page.setComment("Replaced by new edition " + key); - page.setStatus(Status.PROCESSED_USK); - } - } - } finally { - db.endThreadTransaction(); - } - } - FreenetURI uri = key.getURI(); - queueURI(uri, "USK found edition " + uri); - editionsFound.getAndIncrement(); - } else { - Logger.minor(this, "Not Known Good. Edition search continues for " + key + "."); - } - } - - @Override - public short getPollingPriorityNormal() { - return (short) Math.min(RequestStarter.MINIMUM_FETCHABLE_PRIORITY_CLASS, getRoot().getConfig().getRequestPriority() + 1); - } - - @Override public short getPollingPriorityProgress() { return getRoot().getConfig().getRequestPriority(); } @@ -1030,6 +1026,7 @@ public void donePages() { to = Status.DONE; } else if (uri.isUSK()) { to = Status.PROCESSED_USK; + subscribeUSK(uri); } else { Logger.error(this, "Cannot understand the type of the key " + uri); to = Status.DONE; diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 473fbae..0ae645a 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -50,7 +50,7 @@ public Status getStatus() { } public synchronized void setComment(String comment) { - Logger.debug(this, "New comment for " + this); + Logger.debug(this, "New comment " + comment + " for " + this); preModify(); this.comment = comment; postModify(); diff --git a/src/plugins/Spider/db/Status.java b/src/plugins/Spider/db/Status.java index 9c63e03..c889fea 100644 --- a/src/plugins/Spider/db/Status.java +++ b/src/plugins/Spider/db/Status.java @@ -17,7 +17,7 @@ public enum Status { * so they get re-run. */ DONE, // The information is sent to library or there was no result. There is no more work to do. 
PROCESSED_KSK, // The KSK has been sent to the library. We will rescan this later. - PROCESSED_USK, // The USK has been sent to the library. We will rescan this later. + PROCESSED_USK, // The USK has been sent to the library. Subscriptions are set up for these. FAILED, FATALLY_FAILED, // The fetch "failed" fatally and we will ignore the result and never try again. } \ No newline at end of file diff --git a/src/plugins/Spider/index/TermPageEntry.java b/src/plugins/Spider/index/TermPageEntry.java index 0aeb8de..eefa05f 100644 --- a/src/plugins/Spider/index/TermPageEntry.java +++ b/src/plugins/Spider/index/TermPageEntry.java @@ -63,7 +63,9 @@ public TermPageEntry(String s, float r, FreenetURI u, String t, Map pos, Map Date: Wed, 16 Mar 2022 20:02:29 +0100 Subject: [PATCH 27/42] Fill up the queue more often --- src/plugins/Spider/Spider.java | 153 ++++++++++++++++++++++----------- 1 file changed, 101 insertions(+), 52 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 2cfa79d..2cd772b 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -15,9 +15,12 @@ import java.net.URI; import java.util.ArrayList; import java.util.Collections; +import java.util.Date; +import java.util.Deque; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Locale; import java.util.Map; @@ -125,6 +128,9 @@ public long getRealVersion() { private final Set subscribedToUSKs = new HashSet(); + private BulkPageIterator newPages; + private BulkPageIterator failedPages; + public int getLibraryBufferSize() { return librarybuffer.bufferUsageEstimate(); } @@ -263,13 +269,78 @@ private void subscribeUSK(FreenetURI uri) { subscribedToUSKs.add(new SubscribedToUSK(uri)); } + /** + * Fetches pages from a queue, many at the time to avoid locking + * the database on every fetch. 
+ */ + class BulkPageIterator implements Iterator { + private Status queue; + private Deque list = new LinkedList(); + private int BULK_FETCH_SIZE = 1000; + private long TIME_TO_DEFER_DATABASE_READ = TimeUnit.SECONDS.toMillis(30); + private Date lastPoll = new Date(); + + BulkPageIterator(Status status) { + queue = status; + } + + /** + * Fills the cache with pages. + * If the consumer went through the cache to quickly, don't + * fill it, emulating an empty iterator. This addresses the + * case when all the found pages are in progress. It also sets + * a cap on the amount of pages processed per time unit. + * @param extraFetches is amount of pages to fetch on top of + * BULK_FETCH_SIZE. + */ + private void fill(int extraFetches) { + Date now = new Date(); + if (list.isEmpty() && now.after(new Date(lastPoll.getTime() + TIME_TO_DEFER_DATABASE_READ))) { + lastPoll = now; + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + try { + Iterator it = getRoot().getPages(queue); + int i = 0; + while (it.hasNext()) { + list.offer(it.next()); + if (++i > BULK_FETCH_SIZE + extraFetches) { + break; + } + } + } finally { + db.endThreadTransaction(); + } + } + } + + public boolean hasNext(int extraFetches) { + fill(extraFetches); + return !list.isEmpty(); + } + + @Override + public boolean hasNext() { + fill(0); + return !list.isEmpty(); + } + + @Override + public Page next() { + return list.poll(); + } + } + /** * Start requests from new and queued. 
*/ private void startFetches() { ArrayList toStart = null; synchronized (this) { - if (stopped) return; + if (stopped) { + newPages = null; + failedPages = null; + return; + } synchronized (runningFetch) { int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); @@ -279,59 +350,40 @@ private void startFetches() { return; } - // Prepare to start - toStart = new ArrayList(maxParallelRequests - running); - Page pageInWrongList = null; - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - try { - Iterator it = getRoot().getPages(Status.NEW); + if (newPages == null) { + newPages = new BulkPageIterator(Status.NEW); + } - while (running + toStart.size() < maxParallelRequests && it.hasNext()) { - Page page = it.next(); - Logger.debug(this, "Page " + page + " found in NEW."); - // Skip if getting this page already - if (runningFetch.containsKey(page)) continue; - - final Status status = page.getStatus(); - if (status != Status.NEW) { - pageInWrongList = page; - continue; - } + toStart = new ArrayList(maxParallelRequests - running); + while (running + toStart.size() < maxParallelRequests && newPages.hasNext(maxParallelRequests)) { + Page page = newPages.next(); + Logger.debug(this, "Page " + page + " found in NEW."); + // Skip if getting this page already + if (runningFetch.containsKey(page)) continue; - try { - ClientGetter getter = makeGetter(page); + try { + ClientGetter getter = makeGetter(page); - Logger.minor(this, "Starting new " + getter + " " + page); - toStart.add(getter); - runningFetch.put(page, getter); - } catch (MalformedURLException e) { - Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); - page.setStatus(Status.FATALLY_FAILED); - page.setComment("MalformedURLException"); - } - } - } finally { - if (pageInWrongList != null) { - pageInWrongList.pageFoundInWrongList(); + Logger.minor(this, "Starting new " + getter + " " + page); + toStart.add(getter); + runningFetch.put(page, getter); + } catch (MalformedURLException e) { + 
Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); + page.setStatus(Status.FATALLY_FAILED); + page.setComment("MalformedURLException"); } - db.endThreadTransaction(); } - db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); - pageInWrongList = null; - try { - Iterator it = getRoot().getPages(Status.FAILED); - while (running + toStart.size() < maxParallelRequests && it.hasNext()) { - Page page = it.next(); + if (running + toStart.size() < maxParallelRequests) { + if (failedPages == null) { + failedPages = new BulkPageIterator(Status.FAILED); + } + + while (running + toStart.size() < maxParallelRequests && failedPages.hasNext(maxParallelRequests)) { + Page page = failedPages.next(); Logger.debug(this, "Page " + page + " found in FAILED."); - // Skip if getting this page already + // Skip if getting this page already if (runningFetch.containsKey(page)) continue; - - final Status status = page.getStatus(); - if (status != Status.FAILED) { - pageInWrongList = page; - continue; - } try { ClientGetter getter = makeGetter(page); @@ -345,11 +397,8 @@ private void startFetches() { page.setComment("MalformedURLException"); } } - } finally { - if (pageInWrongList != null) { - pageInWrongList.pageFoundInWrongList(); - } - db.endThreadTransaction(); + } else { + failedPages = null; } } } @@ -774,7 +823,7 @@ public void run() { } } - }, 30, 30, TimeUnit.SECONDS); + }, 30L, 1L, TimeUnit.SECONDS); callbackExecutor.schedule(new Runnable() { @Override public void run() { From b52531513a50af51bba13da2bf0e34d7837c9f4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 20 Mar 2022 13:08:31 +0100 Subject: [PATCH 28/42] Remove printout to stderr --- src/plugins/Spider/db/Page.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 0ae645a..980950f 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -128,8 +128,6 @@ private void preModify() { 
if(e.getErrorCode() == StorageError.KEY_NOT_FOUND) { // No serious consequences, so just log it, rather than killing the whole thing. Logger.error(this, "Page: Key not found in index: "+this, e); - System.err.println("Page: Key not found in index: "+this); - e.printStackTrace(); } else { Logger.error(this, "remove from index " + status + " failed", e); throw e; From 4742f0ed52812c3f40ad9eb683d4f9c61219cace Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 20 Mar 2022 13:58:07 +0100 Subject: [PATCH 29/42] Allow info from library to show up on spider page --- src/plugins/Spider/web/MainPage.java | 39 ++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index 8ed3e95..976fe4a 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -4,6 +4,11 @@ */ package plugins.Spider.web; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -128,6 +133,40 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { addForm.addChild("label", "for", "addURI", "Add URI:"); addForm.addChild("input", new String[] { "name", "style" }, new String[] { "addURI", "width: 20em;" }); addForm.addChild("input", new String[] { "type", "value" }, new String[] { "submit", "Add" }); + mainContent.addChild("p"); + final File file = new File(".", "library.info"); + FileReader fr = null; + BufferedReader br = null; + try { + fr = new FileReader(file); + br = new BufferedReader(fr); + String line; + while ((line = br.readLine()) != null) { + mainContent.addChild("#", line); + mainContent.addChild("br"); + } + br.close(); + } catch (FileNotFoundException e) { + // There is no such file. That is fine. 
+ } catch (IOException e) { + // We suddenly couldn't read this file. Strange problem. + throw new RuntimeException(e); + } finally { + if (br != null) { + try { + br.close(); + } catch (IOException e) { + // Ignore. + } + } + if (fr != null) { + try { + fr.close(); + } catch (IOException e) { + // Ignore. + } + } + } InfoboxNode running = pageMaker.getInfobox("Running URI"); HTMLNode runningBox = running.outer; From 8f64caabc2946317e0534d1425a30ebf31e13983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 3 Apr 2022 14:36:55 +0200 Subject: [PATCH 30/42] Improved the handling of USKs Each USK is now stored once in the database and not once per edition. --- src/plugins/Spider/Spider.java | 227 +++++++++++++-------------- src/plugins/Spider/db/Page.java | 94 ++++++----- src/plugins/Spider/db/PerstRoot.java | 17 ++ src/plugins/Spider/db/Status.java | 1 + src/plugins/Spider/web/MainPage.java | 33 ++-- 5 files changed, 200 insertions(+), 172 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 2cd772b..a0424a6 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -85,7 +85,7 @@ public class Spider implements FredPlugin, FredPluginThreadless, FredPluginVersioned, FredPluginRealVersioned, FredPluginL10n, RequestClient { /** Document ID of fetching documents */ - protected Map runningFetch = Collections.synchronizedMap(new HashMap()); + protected Map runningFetch = Collections.synchronizedMap(new HashMap()); private Map> runningFutures = Collections.synchronizedMap(new HashMap>()); @@ -128,8 +128,7 @@ public long getRealVersion() { private final Set subscribedToUSKs = new HashSet(); - private BulkPageIterator newPages; - private BulkPageIterator failedPages; + private BulkPageIterator[] bulkPageIterators = new BulkPageIterator[Status.values().length]; public int getLibraryBufferSize() { return librarybuffer.bufferUsageEstimate(); @@ -178,12 +177,16 @@ public void 
setConfig(Config config) { * * SSKs are added as their corresponding USK. * - * Uris already in the database are not added. + * Uris already in the database are not added. New Uris are put in NEW. + * + * USKs in the database but with new edition are moved to NEW_EDITION. * * @param uri the new uri that needs to be fetched for further indexing */ public void queueURI(FreenetURI uri, String comment) { String sURI = uri.toString(); + final long NO_USK = -1L; + long edition = NO_USK; String lowerCaseURI = sURI.toLowerCase(Locale.US); for (String ext : getRoot().getConfig().getBadlistedExtensions()) { if (lowerCaseURI.endsWith(ext)) { @@ -207,10 +210,21 @@ public void queueURI(FreenetURI uri, String comment) { uri = uri.uskForSSK(); } + if (uri.isUSK()) { + edition = uri.getSuggestedEdition(); + uri = uri.setSuggestedEdition(0L); + } + db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); boolean dbTransactionEnded = false; try { - getRoot().getPageByURI(uri, true, comment); + Page page = getRoot().getPageByURI(uri, true, comment); + if (edition != NO_USK) { + final long oldEdition = page.getEdition(); + if (edition > oldEdition) { + page.setStatus(edition, Status.NEW_EDITION, "New edition replacing " + oldEdition); + } + } db.endThreadTransaction(); dbTransactionEnded = true; } catch (RuntimeException e) { @@ -337,8 +351,9 @@ private void startFetches() { ArrayList toStart = null; synchronized (this) { if (stopped) { - newPages = null; - failedPages = null; + bulkPageIterators[Status.NEW.ordinal()] = null; + bulkPageIterators[Status.NEW_EDITION.ordinal()] = null; + bulkPageIterators[Status.FAILED.ordinal()] = null; return; } @@ -350,55 +365,32 @@ private void startFetches() { return; } - if (newPages == null) { - newPages = new BulkPageIterator(Status.NEW); - } - toStart = new ArrayList(maxParallelRequests - running); - while (running + toStart.size() < maxParallelRequests && newPages.hasNext(maxParallelRequests)) { - Page page = newPages.next(); - 
Logger.debug(this, "Page " + page + " found in NEW."); - // Skip if getting this page already - if (runningFetch.containsKey(page)) continue; - - try { - ClientGetter getter = makeGetter(page); - - Logger.minor(this, "Starting new " + getter + " " + page); - toStart.add(getter); - runningFetch.put(page, getter); - } catch (MalformedURLException e) { - Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); - page.setStatus(Status.FATALLY_FAILED); - page.setComment("MalformedURLException"); - } - } - if (running + toStart.size() < maxParallelRequests) { - if (failedPages == null) { - failedPages = new BulkPageIterator(Status.FAILED); + Status[] statuses = {Status.NEW, Status.NEW_EDITION, Status.FAILED}; + for (Status status : statuses) { + if (bulkPageIterators[status.ordinal()] == null) { + bulkPageIterators[status.ordinal()] = new BulkPageIterator(status); } - while (running + toStart.size() < maxParallelRequests && failedPages.hasNext(maxParallelRequests)) { - Page page = failedPages.next(); - Logger.debug(this, "Page " + page + " found in FAILED."); - // Skip if getting this page already - if (runningFetch.containsKey(page)) continue; + while (running + toStart.size() < maxParallelRequests && + bulkPageIterators[status.ordinal()].hasNext(maxParallelRequests)) { + Page page = bulkPageIterators[status.ordinal()].next(); + Logger.debug(this, "Page " + page + " found in " + status + "."); + // Skip if getting this page already + if (runningFetch.containsKey(page.getURI())) continue; try { ClientGetter getter = makeGetter(page); - Logger.minor(this, "Starting queued " + getter + " " + page); + Logger.minor(this, "Starting new " + getter + " " + page); toStart.add(getter); - runningFetch.put(page, getter); + runningFetch.put(page.getURI(), getter); } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); - page.setStatus(Status.FATALLY_FAILED); - page.setComment("MalformedURLException"); + 
page.setStatus(Status.FATALLY_FAILED, "MalformedURLException"); } } - } else { - failedPages = null; } } } @@ -466,30 +458,24 @@ private void subscribeAllUSKs() { * Callback for fetching the pages */ private class ClientGetterCallback implements ClientGetCallback { - final Page page; - - public ClientGetterCallback(Page page) { - this.page = page; - } - @Override public void onFailure(FetchException e, ClientGetter state) { - Logger.minor(this, "onFailure: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); + Logger.minor(this, "onFailure: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")"); removeFuture(state); if (stopped) return; - callbackExecutor.execute(new OnFailureCallback(e, state, page)); + callbackExecutor.execute(new OnFailureCallback(e, state)); } @Override public void onSuccess(final FetchResult result, final ClientGetter state) { - Logger.minor(this, "onSuccess: " + page + " (q:" + callbackExecutor.getQueue().size() + ")"); + Logger.minor(this, "onSuccess: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")"); removeFuture(state); if (stopped) return; - callbackExecutor.execute(new OnSuccessCallback(result, state, page)); + callbackExecutor.execute(new OnSuccessCallback(result, state)); } private void removeFuture(ClientGetter getter) { @@ -499,10 +485,6 @@ private void removeFuture(ClientGetter getter) { } } - public String toString() { - return super.toString() + ":" + page; - } - @Override public void onResume(ClientContext context) throws ResumeFailedException { } @@ -514,8 +496,12 @@ public RequestClient getRequestClient() { } private ClientGetter makeGetter(Page page) throws MalformedURLException { - ClientGetter getter = new ClientGetter(new ClientGetterCallback(page), - new FreenetURI(page.getURI()), ctx, + FreenetURI uri = new FreenetURI(page.getURI()); + if (uri.isUSK()) { + uri = uri.setSuggestedEdition(page.getEdition()); + } + ClientGetter getter = new ClientGetter(new 
ClientGetterCallback(), + uri, ctx, getPollingPriorityProgress(), null); return getter; } @@ -523,16 +509,14 @@ private ClientGetter makeGetter(Page page) throws MalformedURLException { protected class OnFailureCallback implements Runnable { private FetchException e; private ClientGetter state; - private Page page; - OnFailureCallback(FetchException e, ClientGetter state, Page page) { + OnFailureCallback(FetchException e, ClientGetter state) { this.e = e; this.state = state; - this.page = page; } public void run() { - onFailure(e, state, page); + onFailure(e, state); } } @@ -542,16 +526,14 @@ public void run() { protected class OnSuccessCallback implements Runnable { private FetchResult result; private ClientGetter state; - private Page page; - OnSuccessCallback(FetchResult result, ClientGetter state, Page page) { + OnSuccessCallback(FetchResult result, ClientGetter state) { this.result = result; this.state = state; - this.page = page; } public void run() { - onSuccess(result, state, page); + onSuccess(result, state); } } @@ -590,23 +572,33 @@ public Thread newThread(Runnable r) { * * @param result * @param state - * @param page */ // single threaded - protected void onSuccess(FetchResult result, ClientGetter state, Page page) { + protected void onSuccess(FetchResult result, ClientGetter state) { synchronized (this) { if (stopped) return; } - lastRequestFinishedAt.set(currentTimeMillis()); FreenetURI uri = state.getURI(); + FreenetURI dbURI = uri; + if (uri.isUSK() ) { + dbURI = uri.setSuggestedEdition(0L); + } + + lastRequestFinishedAt.set(currentTimeMillis()); ClientMetadata cm = result.getMetadata(); Bucket data = result.asBucket(); String mimeType = cm.getMIMEType(); boolean dbTransactionEnded = false; db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + Page page = null; try { + page = getRoot().getPageByURI(dbURI); + if (page == null) { + Logger.error(this, "Cannot find page " + dbURI); + return; + } librarybuffer.setBufferSize(1 + 
getConfig().getNewFormatIndexBufferLimit()*1024*1024); /* * instead of passing the current object, the pagecallback object for every page is @@ -638,8 +630,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } catch (UnsafeContentTypeException e) { // wrong mime type - page.setStatus(Status.FATALLY_FAILED); - page.setComment("UnsafeContentTypeException"); + page.setStatus(Status.FATALLY_FAILED, "UnsafeContentTypeException"); db.endThreadTransaction(); dbTransactionEnded = true; @@ -651,7 +642,7 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { return; } catch (Exception e) { // we have lots of invalid html on net - just normal, not error - Logger.normal(this, "exception on content filter for " + page, e); + Logger.normal(this, "exception on content filter for " + uri, e); return; } @@ -668,7 +659,9 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { data.free(); synchronized (this) { - runningFetch.remove(page); + if (page != null) { + runningFetch.remove(page.getURI()); + } } } finally { if (!dbTransactionEnded) { @@ -676,10 +669,9 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { db.rollbackThreadTransaction(); db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); // page is now invalidated. 
- page = getRoot().getPageByURI(uri, false, ""); - if(page != null) { - page.setStatus(Status.FATALLY_FAILED); - page.setComment("could not complete operation dbTransaction not ended"); + if (page != null) { + page.setStatus(Status.FATALLY_FAILED, + "could not complete operation dbTransaction not ended"); } db.endThreadTransaction(); } @@ -687,47 +679,48 @@ protected void onSuccess(FetchResult result, ClientGetter state, Page page) { } } - protected void onFailure(FetchException fe, ClientGetter getter, Page page) { - Logger.minor(this, "Failed: " + page + " : " + getter, fe); + protected void onFailure(FetchException fe, ClientGetter getter) { + FreenetURI uri = getter.getURI(); + Logger.minor(this, "Failed: " + uri + " : " + getter, fe); synchronized (this) { if (stopped) return; } + FreenetURI dbURI = uri; + if (uri.isUSK() ) { + dbURI = uri.setSuggestedEdition(0L); + } + lastRequestFinishedAt.set(currentTimeMillis()); boolean dbTransactionEnded = false; db.beginThreadTransaction(Storage.EXCLUSIVE_TRANSACTION); + Page page = null; try { - synchronized (page) { - if (fe.newURI != null) { - // Cases are noticed when the USK is redirected, - // because of the missing meta-string, to its SSK. - // That is not good from the purpose of maintaining the USK - // in the index. - FreenetURI newURI = fe.newURI; - if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { - try { - FreenetURI uri; - uri = new FreenetURI(page.getURI()); - if (uri.isUSK() && !uri.hasMetaStrings()) { - newURI = uri.pushMetaString(""); - } - } catch (MalformedURLException e) { - // Ignore problems in the URI of the page. - } + page = getRoot().getPageByURI(dbURI); + if (page == null) { + return; + } + if (fe.newURI != null) { + // Cases are noticed when the USK is redirected, + // because of the missing meta-string, to its SSK. + // That is not good from the purpose of maintaining the USK + // in the index. 
+ FreenetURI newURI = fe.newURI; + if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { + if (uri.isUSK() && !uri.hasMetaStrings()) { + newURI = uri.pushMetaString(""); } - // redirect, mark as succeeded - queueURI(newURI, "redirect from " + getter.getURI()); - page.setStatus(Status.DONE); - page.setComment("Redirected to " + newURI + " because of " + fe.getMode()); - } else if (fe.isFatal()) { - // too many tries or fatal, mark as failed - page.setStatus(Status.FATALLY_FAILED); - page.setComment("Fatal: " + fe.getMode()); - } else { - // requeue at back - page.setStatus(Status.FAILED); } + // redirect, mark as succeeded + queueURI(newURI, "redirect from " + getter.getURI()); + page.setStatus(Status.DONE, "Redirected to " + newURI + " because of " + fe.getMode()); + } else if (fe.isFatal()) { + // too many tries or fatal, mark as failed + page.setStatus(Status.FATALLY_FAILED, "Fatal: " + fe.getMode()); + } else { + // requeue at back + page.setStatus(Status.FAILED); } db.endThreadTransaction(); dbTransactionEnded = true; @@ -735,7 +728,9 @@ protected void onFailure(FetchException fe, ClientGetter getter, Page page) { Logger.error(this, "Unexcepected exception in onFailure(): " + e, e); throw new RuntimeException("Unexcepected exception in onFailure()", e); } finally { - runningFetch.remove(page); + if (page != null) { + runningFetch.remove(page.getURI()); + } if (!dbTransactionEnded) { Logger.minor(this, "rollback transaction", new Exception("debug")); db.rollbackThreadTransaction(); @@ -754,7 +749,7 @@ public void terminate(){ synchronized (this) { stopped = true; - for (Map.Entry me : runningFetch.entrySet()) { + for (Map.Entry me : runningFetch.entrySet()) { ClientGetter getter = me.getValue(); Logger.minor(this, "Canceling request" + getter); getter.cancel(clientContext); @@ -856,7 +851,11 @@ public class PageCallBack implements FoundURICallback{ PageCallBack(Page page) { this.page = page; try { - this.uri = new 
FreenetURI(page.getURI()); + uri = new FreenetURI(page.getURI()); + final long edition = page.getEdition(); + if (edition != 0L) { + uri = uri.setSuggestedEdition(edition); + } Logger.debug(this, "New PageCallBack for " + this.page + " (" + this.uri + ")."); } catch (MalformedURLException ex) { Logger.error(this, "Error creating uri from '"+page.getURI()+"'", ex); @@ -878,7 +877,7 @@ public void foundURI(FreenetURI uri) { public void foundURI(FreenetURI uri, boolean inline) { if (stopped) throw new RuntimeException("plugin stopping"); if (logDEBUG) Logger.debug(this, "foundURI " + uri + " on " + page); - queueURI(uri, "Added from " + page.getURI()); + queueURI(uri, "Added from " + this.uri); } protected Integer lastPosition = null; @@ -1029,9 +1028,9 @@ public PageMaker getPageMaker() { return pageMaker; } - public List getRunningFetch() { + public List getRunningFetch() { synchronized (runningFetch) { - return new ArrayList(runningFetch.keySet()); + return new ArrayList(runningFetch.keySet()); } } diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 980950f..3bd4945 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -3,7 +3,9 @@ */ package plugins.Spider.db; +import java.util.ArrayList; import java.util.Date; +import java.util.List; import freenet.support.Logger; import plugins.Spider.org.garret.perst.FieldIndex; @@ -17,6 +19,8 @@ public class Page extends Persistent implements Comparable { protected long id; /** URI of the page */ protected String uri; + /** suggestedEdition of the page */ + protected long edition; /** Title */ protected String pageTitle; /** Status */ @@ -29,31 +33,63 @@ public class Page extends Persistent implements Comparable { public Page() { } - Page(String uri, String comment, Storage storage) { + Page(String uri, long edition, String comment, Storage storage) { this.uri = uri; + this.edition = edition; this.comment = comment; this.status = Status.NEW; this.lastChange = 
System.currentTimeMillis(); storage.makePersistent(this); } - - public synchronized void setStatus(Status status) { - Logger.debug(this, "New status " + status + " for " + this); + + Page(String uri, String comment, Storage storage) { + this(uri, 0L, comment, storage); + } + + public long getEdition() { + return edition; + } + + public synchronized void setStatus(long edition, Status status, String comment) { + List mess = new ArrayList(); + if (edition != 0L) { + mess.add("edition " + edition); + } + if (status != null) { + mess.add("status " + status); + } + if (comment != null) { + mess.add("comment \"" + comment + "\""); + } + Logger.debug(this, "New " + String.join(", ", mess) + " for " + this); preModify(); - this.status = status; + if (edition != 0L) { + this.edition = edition; + } + if (status != null) { + this.status = status; + } + if (comment != null) { + this.comment = comment; + } postModify(); } + public synchronized void setStatus(Status status) { + setStatus(status, null); + } + + public synchronized void setStatus(Status status, String comment) { + setStatus(0, status, comment); + } + public Status getStatus() { return status; } public synchronized void setComment(String comment) { - Logger.debug(this, "New comment " + comment + " for " + this); - preModify(); - this.comment = comment; - postModify(); + setStatus(0, null, comment); } public String getComment() { @@ -106,7 +142,7 @@ public boolean equals(Object obj) { @Override public String toString() { - return "[PAGE: id=" + id + ", title=" + pageTitle + ", uri=" + uri + ", status=" + status + ", comment=" + return "[PAGE: id=" + id + ", title=" + pageTitle + ", uri=" + uri + ", edition=" + edition + " status=" + status + ", comment=" + comment + "]"; } @@ -156,42 +192,4 @@ private void postModify() { } } } - - /** - * Called when we find the page in the wrong list. - * - * This should never happen but it has and is a major problem since it - * locks up the search. 
- */ - public void pageFoundInWrongList() { - Storage storage = getStorage(); - - if (storage != null) { - PerstRoot root = (PerstRoot) storage.getRoot(); - Logger.error(this, "Page " + this + " found in wrong list. Will remove from all lists and put back."); - for (Status status : Status.values()) { - FieldIndex coll = root.getPageIndex(status); - coll.exclusiveLock(); - try { - coll.remove(this); - Logger.minor(this, "Page " + this + " was removed from " + status); - } catch (StorageError e) { - if(e.getErrorCode() == StorageError.KEY_NOT_FOUND) { - // This is the normal case. - } else { - Logger.error(this, "Error in storage when removing " + this + " from " + status + ".", e); - } - } finally { - coll.unlock(); - } - } - FieldIndex coll = root.getPageIndex(status); - coll.exclusiveLock(); - try { - coll.put(this); - } finally { - coll.unlock(); - } - } - } } diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index 1a54ce1..c8af075 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -41,6 +41,14 @@ private void create(Storage storage) { config = new Config(storage); } + /** + * Finds or creates pages in the database. + * + * @param uri The URI of the page to find. + * @param create if true then the page is created if it doesn't exist. + * @param comment is only used when create is true. + * @return the page. + */ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { idPage.exclusiveLock(); uriPage.exclusiveLock(); @@ -65,6 +73,15 @@ public Page getPageByURI(FreenetURI uri, boolean create, String comment) { } } + /** + * Find a page in the database. + * @param uri The page to find. 
+ * @return null if not found + */ + public Page getPageByURI(FreenetURI uri) { + return getPageByURI(uri, false, null); + } + public Page getPageById(long id) { idPage.sharedLock(); try { diff --git a/src/plugins/Spider/db/Status.java b/src/plugins/Spider/db/Status.java index c889fea..a55958b 100644 --- a/src/plugins/Spider/db/Status.java +++ b/src/plugins/Spider/db/Status.java @@ -10,6 +10,7 @@ */ public enum Status { NEW, // Newly found URIs, i.e. never fetched. + NEW_EDITION, // Updated edition NOT_PUSHED, /** * NOT_PUSHED, when LibraryBuffer is enabled, means we have successfully fetched the page but have not diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index 976fe4a..abcc4af 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -12,6 +12,9 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import java.util.NoSuchElementException; + +import javax.naming.SizeLimitExceededException; import plugins.Spider.Spider; import plugins.Spider.db.Config; @@ -56,7 +59,12 @@ static class PageStatus { */ public void processPostRequest(HTTPRequest request, HTMLNode contentNode) { // Queue URI - String addURI = request.getPartAsString("addURI", 512); + String addURI = null; + try { + addURI = request.getPartAsStringThrowing("addURI", 512); + } catch (SizeLimitExceededException e1) { + } catch (NoSuchElementException e1) { + } if (addURI != null && addURI.length() != 0) { try { FreenetURI uri = new FreenetURI(addURI); @@ -82,7 +90,7 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { HTMLNode overviewTable = contentNode.addChild("table", "class", "column"); HTMLNode overviewTableRow = overviewTable.addChild("tr"); - List runningFetch = spider.getRunningFetch(); + List runningFetch = spider.getRunningFetch(); Config config = spider.getConfig(); // Column 1 @@ -176,14 +184,15 @@ public void writeContent(HTTPRequest request, HTMLNode 
contentNode) { if (runningFetch.isEmpty()) { runningContent.addChild("#", "NO URI"); } else { + runningContent.addChild("#", "USKs shown without edition."); HTMLNode list = runningContent.addChild("ol", "style", "overflow: auto; white-space: nowrap;"); - Iterator pi = runningFetch.iterator(); + Iterator pi = runningFetch.iterator(); int maxURI = config.getMaxShownURIs(); for (int i = 0; i < maxURI && pi.hasNext(); i++) { - Page page = pi.next(); - HTMLNode litem = list.addChild("li", "title", page.getComment()); - litem.addChild("a", "href", "/freenet:" + page.getURI(), page.getURI()); + String runningURI = pi.next(); + HTMLNode litem = list.addChild("li"); + litem.addChild("a", "href", "/freenet:" + runningURI, runningURI); } } contentNode.addChild(runningBox); @@ -206,12 +215,12 @@ private PageStatus getPageStatus(Status status) { Iterator it = root.getPages(status); int showURI = spider.getConfig().getMaxShownURIs(); - List page = new ArrayList(); - while (page.size() < showURI && it.hasNext()) { - page.add(it.next()); + List pages = new ArrayList(); + while (pages.size() < showURI && it.hasNext()) { + pages.add(it.next()); } - return new PageStatus(count, page); + return new PageStatus(count, pages); } } @@ -228,6 +237,10 @@ private void listPages(PageStatus pageStatus, HTMLNode parent) { if (title == null) { title = ""; } + long edition = page.getEdition(); + if (edition != 0L) { + title = "Edition " + edition + " " + title; + } litem.addChild("p", " " + page.getLastChangeAsString() + " " + From e8976d38b3bf686222699840f9fcbe22f0e2b02c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 7 May 2022 13:32:54 +0200 Subject: [PATCH 31/42] Avoid having redirected USKs and KSKs ending up in DONE --- src/plugins/Spider/Spider.java | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index a0424a6..8ab11b4 100644 --- 
a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -702,19 +702,31 @@ protected void onFailure(FetchException fe, ClientGetter getter) { return; } if (fe.newURI != null) { - // Cases are noticed when the USK is redirected, - // because of the missing meta-string, to its SSK. - // That is not good from the purpose of maintaining the USK - // in the index. FreenetURI newURI = fe.newURI; if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { if (uri.isUSK() && !uri.hasMetaStrings()) { newURI = uri.pushMetaString(""); } } - // redirect, mark as succeeded - queueURI(newURI, "redirect from " + getter.getURI()); - page.setStatus(Status.DONE, "Redirected to " + newURI + " because of " + fe.getMode()); + // mark as succeeded + Status whereTo = Status.DONE; + if (uri.isUSK()) { + whereTo = Status.PROCESSED_USK; + } else if (uri.isKSK()) { + whereTo = Status.PROCESSED_KSK; + } + page.setStatus(whereTo, "Redirected to " + newURI + " because of " + fe.getMode()); + // redirect. This is done in an independent Runnable to get its own lock. + final FreenetURI redirectedTo = newURI; + final FreenetURI redirectedFrom = getter.getURI(); + callbackExecutor.execute(new Runnable() { + @Override + public void run() { + // If this is a new Edition it is moved again from PROCESSED_USK to NEW_EDITION. + queueURI(redirectedTo, "redirect from " + redirectedFrom); + } + + }); } else if (fe.isFatal()) { // too many tries or fatal, mark as failed page.setStatus(Status.FATALLY_FAILED, "Fatal: " + fe.getMode()); From 131e6612412ba1a30219dc497504692fef5f9c4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 21 May 2022 14:23:21 +0200 Subject: [PATCH 32/42] Separate control of different kinds of fetches Also prepare the database for remembering when a page was last fetched. 
--- src/plugins/Spider/Spider.java | 130 +++++++++++++++---------- src/plugins/Spider/db/Config.java | 79 +++++++++++---- src/plugins/Spider/db/Page.java | 3 + src/plugins/Spider/web/ConfigPage.java | 42 ++++---- src/plugins/Spider/web/MainPage.java | 47 +++++---- 5 files changed, 193 insertions(+), 108 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 8ab11b4..5faea15 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -85,7 +85,12 @@ public class Spider implements FredPlugin, FredPluginThreadless, FredPluginVersioned, FredPluginRealVersioned, FredPluginL10n, RequestClient { /** Document ID of fetching documents */ - protected Map runningFetch = Collections.synchronizedMap(new HashMap()); + protected Map> runningFetches = new HashMap>(); + { + for (Status status : Config.statusesToProcess) { + runningFetches.put(status, Collections.synchronizedMap(new HashMap())); + } + } private Map> runningFutures = Collections.synchronizedMap(new HashMap>()); @@ -94,7 +99,7 @@ public class Spider implements FredPlugin, FredPluginThreadless, */ protected Set allowedMIMETypes; - static int dbVersion = 47; + static int dbVersion = 48; static int version = 53; /** We use the standard http://127.0.0.1:8888/ for parsing HTML regardless of what the local @@ -128,7 +133,7 @@ public long getRealVersion() { private final Set subscribedToUSKs = new HashSet(); - private BulkPageIterator[] bulkPageIterators = new BulkPageIterator[Status.values().length]; + private Map bulkPageIterators = null; public int getLibraryBufferSize() { return librarybuffer.bufferUsageEstimate(); @@ -348,34 +353,39 @@ public Page next() { * Start requests from new and queued. 
*/ private void startFetches() { - ArrayList toStart = null; synchronized (this) { if (stopped) { - bulkPageIterators[Status.NEW.ordinal()] = null; - bulkPageIterators[Status.NEW_EDITION.ordinal()] = null; - bulkPageIterators[Status.FAILED.ordinal()] = null; + bulkPageIterators = null; return; } - synchronized (runningFetch) { - int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(); - int running = runningFetch.size(); + if (bulkPageIterators == null) { + bulkPageIterators = new HashMap(); + } + } - if (maxParallelRequests <= running) { - return; - } + for (Status status : Config.statusesToProcess) { + ArrayList toStart = null; + synchronized (this) { + Map runningFetch = runningFetches.get(status); + synchronized (runningFetch) { + int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(status); + int running = runningFetch.size(); + + if (maxParallelRequests <= running) { + continue; + } - toStart = new ArrayList(maxParallelRequests - running); + toStart = new ArrayList(maxParallelRequests - running); - Status[] statuses = {Status.NEW, Status.NEW_EDITION, Status.FAILED}; - for (Status status : statuses) { - if (bulkPageIterators[status.ordinal()] == null) { - bulkPageIterators[status.ordinal()] = new BulkPageIterator(status); + if (!bulkPageIterators.containsKey(status)) { + bulkPageIterators.put(status, new BulkPageIterator(status)); } + BulkPageIterator bulkPageIterator = bulkPageIterators.get(status); while (running + toStart.size() < maxParallelRequests && - bulkPageIterators[status.ordinal()].hasNext(maxParallelRequests)) { - Page page = bulkPageIterators[status.ordinal()].next(); + bulkPageIterator.hasNext(maxParallelRequests)) { + Page page = bulkPageIterator.next(); Logger.debug(this, "Page " + page + " found in " + status + "."); // Skip if getting this page already if (runningFetch.containsKey(page.getURI())) continue; @@ -393,29 +403,29 @@ private void startFetches() { } } } - } - for (final ClientGetter g : toStart) { 
- try { - g.start(clientContext); - Logger.minor(this, g + " started"); - } catch (FetchException e) { - g.getClientCallback().onFailure(e, g); - continue; - } - ScheduledFuture future = callbackExecutor.scheduleWithFixedDelay(new Runnable() { - long lapsLeft = 10 * 60 * 60; // Ten hours - @Override - public void run() { - if (lapsLeft-- <= 0) { - g.cancel(clientContext); - Logger.minor(this, g + " aborted because of time-out"); - ScheduledFuture f = runningFutures.get(g); - f.cancel(false); - } + for (final ClientGetter g : toStart) { + try { + g.start(clientContext); + Logger.minor(this, g + " started"); + } catch (FetchException e) { + g.getClientCallback().onFailure(e, g); + continue; } - }, 10, 1, TimeUnit.SECONDS); - runningFutures.put(g, future); + ScheduledFuture future = callbackExecutor.scheduleWithFixedDelay(new Runnable() { + long lapsLeft = 10 * 60 * 60; // Ten hours + @Override + public void run() { + if (lapsLeft-- <= 0) { + g.cancel(clientContext); + Logger.minor(this, g + " aborted because of time-out"); + ScheduledFuture f = runningFutures.get(g); + f.cancel(false); + } + } + }, 10, 1, TimeUnit.SECONDS); + runningFutures.put(g, future); + } } } @@ -660,7 +670,7 @@ protected void onSuccess(FetchResult result, ClientGetter state) { synchronized (this) { if (page != null) { - runningFetch.remove(page.getURI()); + removeFromRunningFetches(page); } } } finally { @@ -679,6 +689,18 @@ protected void onSuccess(FetchResult result, ClientGetter state) { } } + private void removeFromRunningFetches(Page page) { + if (runningFetches != null) { + for (Status status : Config.statusesToProcess) { + if (runningFetches.containsKey(status)) { + if (runningFetches.get(status).remove(page.getURI()) != null) { + break; + } + } + } + } + } + protected void onFailure(FetchException fe, ClientGetter getter) { FreenetURI uri = getter.getURI(); Logger.minor(this, "Failed: " + uri + " : " + getter, fe); @@ -741,7 +763,7 @@ public void run() { throw new 
RuntimeException("Unexcepected exception in onFailure()", e); } finally { if (page != null) { - runningFetch.remove(page.getURI()); + removeFromRunningFetches(page); } if (!dbTransactionEnded) { Logger.minor(this, "rollback transaction", new Exception("debug")); @@ -761,15 +783,17 @@ public void terminate(){ synchronized (this) { stopped = true; - for (Map.Entry me : runningFetch.entrySet()) { - ClientGetter getter = me.getValue(); - Logger.minor(this, "Canceling request" + getter); - getter.cancel(clientContext); + for (Status status : Config.statusesToProcess) { + for (Map.Entry me : runningFetches.get(status).entrySet()) { + ClientGetter getter = me.getValue(); + Logger.minor(this, "Canceling request" + getter); + getter.cancel(clientContext); + } + runningFetches.get(status).clear(); } for (SubscribedToUSK stu : new HashSet(subscribedToUSKs)) { stu.unsubscribe(); } - runningFetch.clear(); callbackExecutor.shutdownNow(); } librarybuffer.terminate(); @@ -1040,9 +1064,13 @@ public PageMaker getPageMaker() { return pageMaker; } - public List getRunningFetch() { - synchronized (runningFetch) { - return new ArrayList(runningFetch.keySet()); + public List getRunningFetch(Status status) { + synchronized (runningFetches) { + if (runningFetches != null) { + return new ArrayList(runningFetches.get(status).keySet()); + } else { + return new ArrayList(); + } } } diff --git a/src/plugins/Spider/db/Config.java b/src/plugins/Spider/db/Config.java index c12c506..22f394f 100644 --- a/src/plugins/Spider/db/Config.java +++ b/src/plugins/Spider/db/Config.java @@ -12,13 +12,20 @@ public class Config extends Persistent implements Cloneable { + public static final Status[] statusesToProcess = {Status.NEW, Status.NEW_EDITION, Status.FAILED}; + public static final boolean[] workingRelationsToProcess = {true, false}; private String indexTitle; private String indexOwner; private String indexOwnerEmail; private int maxShownURIs; - private int maxParallelRequestsWorking; - private int 
maxParallelRequestsNonWorking; + /** working, status + * + * This should be an array with dimensions working and status. + * This is problematic in the database so it will instead be stored + * as a string of semicolon-separated ints. + */ + private String maxParallelRequests; private int beginWorkingPeriod; // Between 0 and 23 private int endWorkingPeriod; // Between 0 and 23 private String[] badlistedExtensions; @@ -40,8 +47,7 @@ public Config(Storage storage) { maxShownURIs = 50; - maxParallelRequestsWorking = 0; - maxParallelRequestsNonWorking = 0; + maxParallelRequests = ""; beginWorkingPeriod = 23; endWorkingPeriod = 7; @@ -111,25 +117,64 @@ public synchronized String getIndexOwnerEmail() { return indexOwnerEmail; } - public synchronized void setMaxParallelRequestsWorking(int maxParallelRequests) { - assert !isPersistent(); - this.maxParallelRequestsWorking = maxParallelRequests; + private int workingIndex(boolean b) { + for (int i = 0; i < workingRelationsToProcess.length; i++) { + if (workingRelationsToProcess[i] == b) { + return i; + } + } + throw new RuntimeException(); + } + + private int statusIndex(Status status) { + for (int i = 0; i < statusesToProcess.length; i++) { + if (statusesToProcess[i] == status) { + return i; + } + } + throw new RuntimeException(); } - public synchronized int getMaxParallelRequestsWorking() { - return maxParallelRequestsWorking; + private int[][] unpackMaxParallelRequests() { + int[][] requests = new int[workingRelationsToProcess.length][statusesToProcess.length]; + String[] arr = maxParallelRequests.split(";"); + int arrIndex = 0; + for (int w = 0; w < workingRelationsToProcess.length; w++) { + for (int s= 0; s < statusesToProcess.length; s++) { + requests[w][s] = 0; + if (arrIndex < arr.length && !arr[arrIndex].equals("")) { + try { + requests[w][s] = Integer.parseInt(arr[arrIndex++]); + } catch (NumberFormatException e) { + // Ignore if we can't do the conversion. 
+ } + } + } + } + return requests; } - public synchronized void setMaxParallelRequestsNonWorking(int maxParallelRequests) { + public synchronized void setMaxParallelRequests(boolean working, Status status, int maxParallelRequests) { assert !isPersistent(); - this.maxParallelRequestsNonWorking = maxParallelRequests; + int[][] requests = unpackMaxParallelRequests(); + requests[workingIndex(working)][statusIndex(status)] = maxParallelRequests; + + StringBuilder sb = new StringBuilder(); + for (int w = 0; w < workingRelationsToProcess.length; w++) { + for (int s= 0; s < statusesToProcess.length; s++) { + sb.append(Integer.toString(requests[w][s])); + sb.append(";"); + } + } + this.maxParallelRequests = sb.toString(); } - public synchronized int getMaxParallelRequestsNonWorking() { - return maxParallelRequestsNonWorking; + public synchronized int getMaxParallelRequests(boolean working, Status status) { + int[][] requests = unpackMaxParallelRequests(); + return requests[workingIndex(working)][statusIndex(status)]; } - public synchronized int getMaxParallelRequests() { + public synchronized int getMaxParallelRequests(Status status) { int actualHour = Calendar.getInstance().get(Calendar.HOUR_OF_DAY); Boolean isWorking = true; @@ -145,11 +190,7 @@ public synchronized int getMaxParallelRequests() { isWorking = (actualHour > this.getBeginWorkingPeriod() || actualHour < this.getEndWorkingPeriod()); } - if(isWorking) { - return this.getMaxParallelRequestsWorking(); - } else { - return this.getMaxParallelRequestsNonWorking(); - } + return this.getMaxParallelRequests(isWorking, status); } public synchronized void setBeginWorkingPeriod(int beginWorkingPeriod) { diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 3bd4945..5017ef6 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -27,6 +27,8 @@ public class Page extends Persistent implements Comparable { protected Status status; /** Last Change Time */ protected 
long lastChange; + /** Last Fetched Time */ + protected long lastFetched; /** Comment, for debugging */ protected String comment; @@ -39,6 +41,7 @@ public Page() { this.comment = comment; this.status = Status.NEW; this.lastChange = System.currentTimeMillis(); + this.lastFetched = 0L; storage.makePersistent(this); } diff --git a/src/plugins/Spider/web/ConfigPage.java b/src/plugins/Spider/web/ConfigPage.java index 39da6b3..393ca1d 100644 --- a/src/plugins/Spider/web/ConfigPage.java +++ b/src/plugins/Spider/web/ConfigPage.java @@ -6,6 +6,7 @@ import plugins.Spider.Spider; import plugins.Spider.db.Config; +import plugins.Spider.db.Status; import freenet.clients.http.PageMaker; import freenet.pluginmanager.PluginRespirator; import freenet.support.HTMLNode; @@ -34,13 +35,16 @@ class ConfigPage implements WebPage { public synchronized void processPostRequest(HTTPRequest request, HTMLNode contentNode) { config = spider.getConfig().clone(); - if (request.isPartSet("maxParallelRequestsWorking")) { - int v = request.getIntPart("maxParallelRequestsWorking", config.getMaxParallelRequestsWorking()); - config.setMaxParallelRequestsWorking(v); - } - if (request.isPartSet("maxParallelRequestsNonWorking")) { - int v = request.getIntPart("maxParallelRequestsNonWorking", config.getMaxParallelRequestsNonWorking()); - config.setMaxParallelRequestsNonWorking(v); + for (int w = 0; w < Config.workingRelationsToProcess.length; w++) { + boolean working = Config.workingRelationsToProcess[w]; + for (int s = 0; s < Config.statusesToProcess.length; s++) { + Status status = Config.statusesToProcess[s]; + final String name = "maxParallelRequests" + working + status; + if (request.isPartSet(name)) { + int v = request.getIntPart(name, config.getMaxParallelRequests(working, status)); + config.setMaxParallelRequests(working, status, v); + } + } } if (request.isPartSet("beginWorkingPeriod")) { int v = request.getIntPart("beginWorkingPeriod", config.getBeginWorkingPeriod()); @@ -115,16 +119,20 @@ public 
void writeContent(HTTPRequest request, HTMLNode contentNode) { configForm.addChild("div", "class", "configprefix", "Spider Options"); HTMLNode spiderConfig = configForm.addChild("ul", "class", "config"); - addConfig(spiderConfig, // - "Max Parallel Requests (Working)", "Maximum number of parallel requests if we are in the working period.", // - "maxParallelRequestsWorking", // - new String[] { "0", "1", "2", "5", "10", "15", "25", "50", "75", "100", "125", "150", "200", "250", "500", "1000" }, // - Integer.toString(config.getMaxParallelRequestsWorking())); - addConfig(spiderConfig, // - "Max Parallel Requests (Non-Working)", "Maximum number of parallel requests if we are not in the working period.", // - "maxParallelRequestsNonWorking", // - new String[] { "0", "1", "2", "5", "10", "15", "25", "50", "75", "100", "125", "150", "200", "250", "500", "1000" }, // - Integer.toString(config.getMaxParallelRequestsNonWorking())); + for (int w = 0; w < Config.workingRelationsToProcess.length; w++) { + boolean working = Config.workingRelationsToProcess[w]; + for (int s = 0; s < Config.statusesToProcess.length; s++) { + Status status = Config.statusesToProcess[s]; + final String name = "maxParallelRequests" + working + status; + addConfig(spiderConfig, // + "Max Parallel for " + status + " (" + (working ? "Working" : "Non-Working") + ")", + "Maximum number of parallel requests from " + status + + " if we are in the " + (working ? 
"working" : "non-working" ) + " period.", + name, // + new String[] { "0", "1", "2", "5", "10", "15", "25", "50", "75", "100", "125", "150", "200", "250", "500", "1000" }, // + Integer.toString(config.getMaxParallelRequests(working, status))); + } + } addConfig(spiderConfig, // "Working period beginning hour", "Beginning hour of the Working period.", // diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index abcc4af..a287e6b 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -90,15 +90,19 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { HTMLNode overviewTable = contentNode.addChild("table", "class", "column"); HTMLNode overviewTableRow = overviewTable.addChild("tr"); - List runningFetch = spider.getRunningFetch(); Config config = spider.getConfig(); // Column 1 HTMLNode nextTableCell = overviewTableRow.addChild("td", "class", "first"); HTMLNode statusContent = pageMaker.getInfobox("#", "Spider Status", nextTableCell); - statusContent.addChild("#", "Running Request: " + runningFetch.size() + "/" - + config.getMaxParallelRequests()); - statusContent.addChild("br"); + for (int i = 0; i < Config.statusesToProcess.length; i++) { + Status status = Config.statusesToProcess[i]; + List runningFetch = spider.getRunningFetch(status); + statusContent.addChild("#", "Running Request for " + status + ": " + runningFetch.size() + "/" + + config.getMaxParallelRequests(status)); + statusContent.addChild("br"); + + } for (Status status : Status.values()) { statusContent.addChild("#", status + ": " + getPageStatus(status).count); statusContent.addChild("br"); @@ -176,26 +180,27 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { } } - InfoboxNode running = pageMaker.getInfobox("Running URI"); - HTMLNode runningBox = running.outer; - runningBox.addAttribute("style", "right: 0;"); - HTMLNode runningContent = running.content; + for (Status status : 
Config.statusesToProcess) { + List runningFetch = spider.getRunningFetch(status); + if (!runningFetch.isEmpty()) { + InfoboxNode running = pageMaker.getInfobox("Running URIs for " + status); + HTMLNode runningBox = running.outer; + runningBox.addAttribute("style", "right: 0;"); + HTMLNode runningContent = running.content; - if (runningFetch.isEmpty()) { - runningContent.addChild("#", "NO URI"); - } else { - runningContent.addChild("#", "USKs shown without edition."); - HTMLNode list = runningContent.addChild("ol", "style", "overflow: auto; white-space: nowrap;"); - - Iterator pi = runningFetch.iterator(); - int maxURI = config.getMaxShownURIs(); - for (int i = 0; i < maxURI && pi.hasNext(); i++) { - String runningURI = pi.next(); - HTMLNode litem = list.addChild("li"); - litem.addChild("a", "href", "/freenet:" + runningURI, runningURI); + runningContent.addChild("#", "USKs shown without edition."); + HTMLNode list = runningContent.addChild("ol", "style", "overflow: auto; white-space: nowrap;"); + + Iterator pi = runningFetch.iterator(); + int maxURI = config.getMaxShownURIs(); + for (int i = 0; i < maxURI && pi.hasNext(); i++) { + String runningURI = pi.next(); + HTMLNode litem = list.addChild("li"); + litem.addChild("a", "href", "/freenet:" + runningURI, runningURI); + } + contentNode.addChild(runningBox); } } - contentNode.addChild(runningBox); for (Status status : Status.values()) { InfoboxNode d = pageMaker.getInfobox(status + " URIs"); From 5293d4d00c746bd49b8e0a98e10583b0f7c5d107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 5 Jun 2022 14:57:30 +0200 Subject: [PATCH 33/42] Process fetched USKs with the oldest first Never fetched pages are in NEW, NEW_EDITION is processed in lastFetched order. Only the top page of an USK is subscribed (PROCESSED_USK). Sub-pages are put into DONE. 
--- src/plugins/Spider/Spider.java | 24 +++++++++++++++++------- src/plugins/Spider/db/Page.java | 28 ++++++++++++++++++++++++++-- src/plugins/Spider/db/PerstRoot.java | 6 +++++- src/plugins/Spider/web/MainPage.java | 4 ++++ 4 files changed, 52 insertions(+), 10 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 5faea15..2fefc24 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -227,7 +227,11 @@ public void queueURI(FreenetURI uri, String comment) { if (edition != NO_USK) { final long oldEdition = page.getEdition(); if (edition > oldEdition) { - page.setStatus(edition, Status.NEW_EDITION, "New edition replacing " + oldEdition); + Status whereTo = Status.NEW_EDITION; + if (!page.hasBeenFetched()) { + whereTo = Status.NEW; + } + page.setStatus(edition, whereTo, "New edition replacing " + oldEdition); } } db.endThreadTransaction(); @@ -636,6 +640,7 @@ protected void onSuccess(FetchResult result, ClientGetter state) { } pageCallBack.finish(); page.setStatus(Status.NOT_PUSHED); + page.setLastFetched(); librarybuffer.maybeSend(); } catch (UnsafeContentTypeException e) { @@ -724,12 +729,6 @@ protected void onFailure(FetchException fe, ClientGetter getter) { return; } if (fe.newURI != null) { - FreenetURI newURI = fe.newURI; - if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { - if (uri.isUSK() && !uri.hasMetaStrings()) { - newURI = uri.pushMetaString(""); - } - } // mark as succeeded Status whereTo = Status.DONE; if (uri.isUSK()) { @@ -737,6 +736,13 @@ protected void onFailure(FetchException fe, ClientGetter getter) { } else if (uri.isKSK()) { whereTo = Status.PROCESSED_KSK; } + FreenetURI newURI = fe.newURI; + if (fe.mode == FetchException.FetchExceptionMode.NOT_ENOUGH_PATH_COMPONENTS) { + if (uri.isUSK() && !uri.hasMetaStrings()) { + newURI = uri.pushMetaString(""); + whereTo = Status.DONE; + } + } page.setStatus(whereTo, "Redirected to " + newURI + " because of " 
+ fe.getMode()); // redirect. This is done in an independent Runnable to get its own lock. final FreenetURI redirectedTo = newURI; @@ -1112,6 +1118,10 @@ public void donePages() { to = Status.PROCESSED_KSK; } else if (uri.isSSK()) { to = Status.DONE; + } else if (uri.isUSK() && + uri.hasMetaStrings() && !uri.getMetaString().equals("")) { + // This is not the top element of this USK. + to = Status.DONE; } else if (uri.isUSK()) { to = Status.PROCESSED_USK; subscribeUSK(uri); diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 5017ef6..49d225e 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -27,7 +27,12 @@ public class Page extends Persistent implements Comparable { protected Status status; /** Last Change Time */ protected long lastChange; - /** Last Fetched Time */ + /** Last Fetched Time + * + * This is for the case when many USK pages are updated more often + * than they are fetched. In that case, this is used to prioritize + * the update of older pages over pages that were recently fetched. + */ protected long lastFetched; /** Comment, for debugging */ protected String comment; @@ -41,7 +46,7 @@ public Page() { this.comment = comment; this.status = Status.NEW; this.lastChange = System.currentTimeMillis(); - this.lastFetched = 0L; + this.lastFetched = 0L; // 0 means never fetched. 
storage.makePersistent(this); } @@ -126,6 +131,25 @@ public Date getLastChange() { return new Date(lastChange); } + public void setLastFetched() { + lastFetched = System.currentTimeMillis(); + } + + public boolean hasBeenFetched() { + return lastFetched != 0L; + } + + public Date getLastFetched() { + return new Date(lastFetched); + } + + public String getLastFetchedAsString() { + if (lastFetched > 0L) { + return new Date(lastFetched).toString(); + } + return ""; + } + @Override public int hashCode() { return (int) (id ^ (id >>> 32)); diff --git a/src/plugins/Spider/db/PerstRoot.java b/src/plugins/Spider/db/PerstRoot.java index c8af075..f4d0a7a 100644 --- a/src/plugins/Spider/db/PerstRoot.java +++ b/src/plugins/Spider/db/PerstRoot.java @@ -35,7 +35,11 @@ private void create(Storage storage) { uriPage = storage.createFieldIndex(Page.class, "uri", true); statusPages = new FieldIndex[Status.values().length]; for (Status status : Status.values()) { - statusPages[status.ordinal()] = storage.createFieldIndex(Page.class, "lastChange", true); + String fieldName = "lastChange"; + if (status == Status.NEW_EDITION) { + fieldName = "lastFetched"; + } + statusPages[status.ordinal()] = storage.createFieldIndex(Page.class, fieldName, true); } config = new Config(storage); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index a287e6b..e0c4fcd 100644 --- a/src/plugins/Spider/web/MainPage.java +++ b/src/plugins/Spider/web/MainPage.java @@ -242,6 +242,10 @@ private void listPages(PageStatus pageStatus, HTMLNode parent) { if (title == null) { title = ""; } + String changed = page.getLastFetchedAsString(); + if (!changed.equals("")) { + title = "Last changed " + changed + " " + title; + } long edition = page.getEdition(); if (edition != 0L) { title = "Edition " + edition + " " + title; From dfff40fd6639d8bc9c83558eb0f4598e7940d855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 5 Jun 2022 20:41:59 +0200 Subject: [PATCH 
34/42] Count editionsFound in the right variable Remove the no longer used variable. --- src/plugins/Spider/Spider.java | 10 +++------- src/plugins/Spider/web/MainPage.java | 4 +--- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 2fefc24..40e0234 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -128,10 +128,9 @@ public long getRealVersion() { private LibraryBuffer librarybuffer; private final AtomicLong lastRequestFinishedAt = new AtomicLong(); - private final AtomicInteger newUSKs = new AtomicInteger(); private final AtomicInteger editionsFound = new AtomicInteger(); - private final Set subscribedToUSKs = new HashSet(); + private final Set subscribedToUSKs = Collections.synchronizedSet(new HashSet()); private Map bulkPageIterators = null; @@ -155,10 +154,6 @@ public int getSubscribedToUSKs() { return subscribedToUSKs.size(); } - public int getNewUSKs() { - return newUSKs.get(); - } - public int getEditionsFound() { return editionsFound.get(); } @@ -256,6 +251,7 @@ private class SubscribedToUSK implements USKCallback { try { usk = USK.create(uri); } catch (MalformedURLException e) { + Logger.error(this, "Cannot subscribe to " + uri + ".", e); return; } (clientContext.uskManager).subscribe(usk, this, false, Spider.this); @@ -265,7 +261,7 @@ private class SubscribedToUSK implements USKCallback { public void onFoundEdition(long l, USK key, ClientContext context, boolean metadata, short codec, byte[] data, boolean newKnownGood, boolean newSlot) { Logger.minor(this, "Found new Edition for " + key + ", newKnownGood=" + newKnownGood + " newSlot=" + newSlot + "."); - newUSKs.getAndIncrement(); + editionsFound.getAndIncrement(); subscribedToUSKs.remove(this); FreenetURI uri = key.getURI(); diff --git a/src/plugins/Spider/web/MainPage.java b/src/plugins/Spider/web/MainPage.java index e0c4fcd..d8cb6bb 100644 --- a/src/plugins/Spider/web/MainPage.java +++ 
b/src/plugins/Spider/web/MainPage.java @@ -111,9 +111,7 @@ public void writeContent(HTTPRequest request, HTMLNode contentNode) { statusContent.addChild("br"); statusContent.addChild("#", "Subscribed USKs: " + spider.getSubscribedToUSKs()); statusContent.addChild("br"); - statusContent.addChild("#", "URIs replaced by new USKs: " + spider.getNewUSKs()); - statusContent.addChild("br"); - statusContent.addChild("#", "Found editions: " + spider.getEditionsFound()); + statusContent.addChild("#", "Found new editions: " + spider.getEditionsFound()); statusContent.addChild("br"); statusContent.addChild("#", "Library buffer size: "+spider.getLibraryBufferSize()); long lastRequestFinishedAt = spider.getLastRequestFinishedAt(); From da69a1b95f1efcd92b9461755ef8284a94fa5704 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sun, 19 Jun 2022 19:47:10 +0200 Subject: [PATCH 35/42] Delay queueing of new editions Avoid moving the page between lists if the status is not changed. --- src/plugins/Spider/Spider.java | 14 ++++++++++---- src/plugins/Spider/db/Page.java | 3 +-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 40e0234..0c991ef 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -243,6 +243,7 @@ public void queueURI(FreenetURI uri, String comment) { } private class SubscribedToUSK implements USKCallback { + private static final int DELAY_IN_MINUTES_AFTER_NEW_EDITION_SEEN = 10; private FreenetURI uri; USK usk; @@ -258,14 +259,19 @@ private class SubscribedToUSK implements USKCallback { } @Override - public void onFoundEdition(long l, USK key, ClientContext context, boolean metadata, + public void onFoundEdition(long l, final USK key, ClientContext context, boolean metadata, short codec, byte[] data, boolean newKnownGood, boolean newSlot) { Logger.minor(this, "Found new Edition for " + key + ", newKnownGood=" + newKnownGood + " newSlot=" + 
newSlot + "."); editionsFound.getAndIncrement(); - subscribedToUSKs.remove(this); - FreenetURI uri = key.getURI(); + final FreenetURI uri = key.getURI(); - queueURI(uri, "USK found edition " + uri); + callbackExecutor.schedule(new Runnable() { + @Override + public void run() { + Logger.debug(this, "Queueing new Edition for " + key + "."); + queueURI(uri, "USK found edition " + uri); + } + }, DELAY_IN_MINUTES_AFTER_NEW_EDITION_SEEN, TimeUnit.MINUTES); } public void unsubscribe() { diff --git a/src/plugins/Spider/db/Page.java b/src/plugins/Spider/db/Page.java index 49d225e..ed44c48 100644 --- a/src/plugins/Spider/db/Page.java +++ b/src/plugins/Spider/db/Page.java @@ -114,9 +114,8 @@ public long getId() { public void setPageTitle(String pageTitle) { Logger.debug(this, "New page title for " + this); - preModify(); this.pageTitle = pageTitle; - postModify(); + modify(); } public String getPageTitle() { From 556df6b2c25f848baf53d787a20acaa426bf960a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 15 Jul 2022 15:30:19 +0200 Subject: [PATCH 36/42] Cleanup log message --- src/plugins/Spider/Spider.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 0c991ef..b7b6032 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -476,7 +476,9 @@ private void subscribeAllUSKs() { private class ClientGetterCallback implements ClientGetCallback { @Override public void onFailure(FetchException e, ClientGetter state) { - Logger.minor(this, "onFailure: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")"); + Logger.minor(this, + "onFailure: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")", + e); removeFuture(state); if (stopped) return; @@ -708,9 +710,16 @@ private void removeFromRunningFetches(Page page) { } } + /** + * Do what needs to be done when a fetch request has failed. 
+ * + * @param fe is the exception that made it fail. + * Used to decide what to do. + * @param getter is the ClientGetter that failed. + */ protected void onFailure(FetchException fe, ClientGetter getter) { - FreenetURI uri = getter.getURI(); - Logger.minor(this, "Failed: " + uri + " : " + getter, fe); + final FreenetURI uri = getter.getURI(); + Logger.minor(this, "Failed: " + uri + " : " + getter); synchronized (this) { if (stopped) return; @@ -748,7 +757,7 @@ protected void onFailure(FetchException fe, ClientGetter getter) { page.setStatus(whereTo, "Redirected to " + newURI + " because of " + fe.getMode()); // redirect. This is done in an independent Runnable to get its own lock. final FreenetURI redirectedTo = newURI; - final FreenetURI redirectedFrom = getter.getURI(); + final FreenetURI redirectedFrom = uri; callbackExecutor.execute(new Runnable() { @Override public void run() { From 42df11dbc0da44814be680bd123141411fc71945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 15 Jul 2022 17:25:13 +0200 Subject: [PATCH 37/42] Index runningFetches on FreenetURI instead of String --- src/plugins/Spider/Spider.java | 42 ++++++++++++++++------------------ 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index b7b6032..b9e7643 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -85,10 +85,10 @@ public class Spider implements FredPlugin, FredPluginThreadless, FredPluginVersioned, FredPluginRealVersioned, FredPluginL10n, RequestClient { /** Document ID of fetching documents */ - protected Map> runningFetches = new HashMap>(); + protected Map> runningFetches = new HashMap>(); { for (Status status : Config.statusesToProcess) { - runningFetches.put(status, Collections.synchronizedMap(new HashMap())); + runningFetches.put(status, Collections.synchronizedMap(new HashMap())); } } @@ -373,7 +373,7 @@ private void startFetches() { for (Status
status : Config.statusesToProcess) { ArrayList toStart = null; synchronized (this) { - Map runningFetch = runningFetches.get(status); + Map runningFetch = runningFetches.get(status); synchronized (runningFetch) { int maxParallelRequests = getRoot().getConfig().getMaxParallelRequests(status); int running = runningFetch.size(); @@ -397,11 +397,15 @@ private void startFetches() { if (runningFetch.containsKey(page.getURI())) continue; try { - ClientGetter getter = makeGetter(page); + FreenetURI uri = new FreenetURI(page.getURI()); + if (uri.isUSK()) { + uri = uri.setSuggestedEdition(page.getEdition()); + } + ClientGetter getter = makeGetter(uri); Logger.minor(this, "Starting new " + getter + " " + page); toStart.add(getter); - runningFetch.put(page.getURI(), getter); + runningFetch.put(uri, getter); } catch (MalformedURLException e) { Logger.error(this, "IMPOSSIBLE-Malformed URI: " + page, e); page.setStatus(Status.FATALLY_FAILED, "MalformedURLException"); @@ -513,11 +517,7 @@ public RequestClient getRequestClient() { } } - private ClientGetter makeGetter(Page page) throws MalformedURLException { - FreenetURI uri = new FreenetURI(page.getURI()); - if (uri.isUSK()) { - uri = uri.setSuggestedEdition(page.getEdition()); - } + private ClientGetter makeGetter(FreenetURI uri) throws MalformedURLException { ClientGetter getter = new ClientGetter(new ClientGetterCallback(), uri, ctx, getPollingPriorityProgress(), null); @@ -678,9 +678,7 @@ protected void onSuccess(FetchResult result, ClientGetter state) { data.free(); synchronized (this) { - if (page != null) { - removeFromRunningFetches(page); - } + removeFromRunningFetches(uri); } } finally { if (!dbTransactionEnded) { @@ -698,11 +696,11 @@ protected void onSuccess(FetchResult result, ClientGetter state) { } } - private void removeFromRunningFetches(Page page) { + private void removeFromRunningFetches(FreenetURI uri) { if (runningFetches != null) { for (Status status : Config.statusesToProcess) { if 
(runningFetches.containsKey(status)) { - if (runningFetches.get(status).remove(page.getURI()) != null) { + if (runningFetches.get(status).remove(uri) != null) { break; } } @@ -779,9 +777,7 @@ public void run() { Logger.error(this, "Unexcepected exception in onFailure(): " + e, e); throw new RuntimeException("Unexcepected exception in onFailure()", e); } finally { - if (page != null) { - removeFromRunningFetches(page); - } + removeFromRunningFetches(uri); if (!dbTransactionEnded) { Logger.minor(this, "rollback transaction", new Exception("debug")); db.rollbackThreadTransaction(); @@ -801,7 +797,7 @@ public void terminate(){ stopped = true; for (Status status : Config.statusesToProcess) { - for (Map.Entry me : runningFetches.get(status).entrySet()) { + for (Map.Entry me : runningFetches.get(status).entrySet()) { ClientGetter getter = me.getValue(); Logger.minor(this, "Canceling request" + getter); getter.cancel(clientContext); @@ -1082,13 +1078,15 @@ public PageMaker getPageMaker() { } public List getRunningFetch(Status status) { + List result = new ArrayList(); synchronized (runningFetches) { if (runningFetches != null) { - return new ArrayList(runningFetches.get(status).keySet()); - } else { - return new ArrayList(); + for (FreenetURI uri : runningFetches.get(status).keySet()) { + result.add(uri.toString()); + } } } + return result; } public PluginRespirator getPluginRespirator() { From c5da094838b8f8056bb61f47cfeda052f9d1981c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Sat, 16 Jul 2022 14:33:58 +0200 Subject: [PATCH 38/42] Avoid having onFailure called before the future is set up Also improve the logging of the ClientGetterCallback. 
--- src/plugins/Spider/Spider.java | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index b9e7643..66ecc23 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -415,13 +415,6 @@ private void startFetches() { } for (final ClientGetter g : toStart) { - try { - g.start(clientContext); - Logger.minor(this, g + " started"); - } catch (FetchException e) { - g.getClientCallback().onFailure(e, g); - continue; - } ScheduledFuture future = callbackExecutor.scheduleWithFixedDelay(new Runnable() { long lapsLeft = 10 * 60 * 60; // Ten hours @Override @@ -435,6 +428,13 @@ public void run() { } }, 10, 1, TimeUnit.SECONDS); runningFutures.put(g, future); + try { + g.start(clientContext); + Logger.minor(this, g + " started"); + } catch (FetchException e) { + g.getClientCallback().onFailure(e, g); + continue; + } } } } @@ -481,7 +481,7 @@ private class ClientGetterCallback implements ClientGetCallback { @Override public void onFailure(FetchException e, ClientGetter state) { Logger.minor(this, - "onFailure: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")", + state + " onFailure: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")", e); removeFuture(state); @@ -492,7 +492,7 @@ public void onFailure(FetchException e, ClientGetter state) { @Override public void onSuccess(final FetchResult result, final ClientGetter state) { - Logger.minor(this, "onSuccess: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")"); + Logger.minor(this, state + " onSuccess: " + state.getURI() + " (q:" + callbackExecutor.getQueue().size() + ")"); removeFuture(state); if (stopped) return; From e30f577b382f7d387cd5f45e1efb5705e8a80272 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Mon, 18 Jul 2022 14:39:37 +0200 Subject: [PATCH 39/42] Fix problem caused by changing runningFetches to FreenetURI --- 
src/plugins/Spider/Spider.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 66ecc23..e9932c5 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -393,11 +393,11 @@ private void startFetches() { bulkPageIterator.hasNext(maxParallelRequests)) { Page page = bulkPageIterator.next(); Logger.debug(this, "Page " + page + " found in " + status + "."); - // Skip if getting this page already - if (runningFetch.containsKey(page.getURI())) continue; - try { FreenetURI uri = new FreenetURI(page.getURI()); + // Skip if getting this page already + if (runningFetch.containsKey(uri)) continue; + if (uri.isUSK()) { uri = uri.setSuggestedEdition(page.getEdition()); } From 1738a4a44c871dabee818c740ea2cb2efeb7b372 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Mon, 18 Jul 2022 14:40:45 +0200 Subject: [PATCH 40/42] Shorten the bulk size and increase the frequency --- src/plugins/Spider/Spider.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index e9932c5..23d6c3f 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -301,8 +301,8 @@ private void subscribeUSK(FreenetURI uri) { class BulkPageIterator implements Iterator { private Status queue; private Deque list = new LinkedList(); - private int BULK_FETCH_SIZE = 1000; - private long TIME_TO_DEFER_DATABASE_READ = TimeUnit.SECONDS.toMillis(30); + private int BULK_FETCH_SIZE = 100; + private long TIME_TO_DEFER_DATABASE_READ = TimeUnit.SECONDS.toMillis(10); private Date lastPoll = new Date(); BulkPageIterator(Status status) { From 983065e184788a2eac8a47082ed6b8416a6716ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 30 Sep 2022 19:30:16 +0200 Subject: [PATCH 41/42] Shortened the logging on UnsafeContentTypeException. 
--- src/plugins/Spider/Spider.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 23d6c3f..86b4aab 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -625,7 +625,7 @@ protected void onSuccess(FetchResult result, ClientGetter state) { * provided). */ PageCallBack pageCallBack = new PageCallBack(page); - Logger.minor(this, "Successful: " + uri + " : " + page.getId()); + Logger.minor(this, "Successful: " + uri + " id=" + page.getId()); try { if ("text/plain".equals(mimeType)) { @@ -653,7 +653,7 @@ protected void onSuccess(FetchResult result, ClientGetter state) { db.endThreadTransaction(); dbTransactionEnded = true; - Logger.minor(this, "UnsafeContentTypeException " + uri + " : " + page.getId(), e); + Logger.minor(this, "" + e + " " + uri + " id=" + page.getId()); return; // Ignore } catch (IOException e) { // ugh? From 76d012db30a364740955483a6e206f5e737cf19c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Debora=20W=C3=B6pcke?= Date: Fri, 30 Sep 2022 20:19:20 +0200 Subject: [PATCH 42/42] Clean out uris if the bad listed list is augmented and fetch fails --- src/plugins/Spider/Spider.java | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/plugins/Spider/Spider.java b/src/plugins/Spider/Spider.java index 86b4aab..d48bb4f 100644 --- a/src/plugins/Spider/Spider.java +++ b/src/plugins/Spider/Spider.java @@ -768,8 +768,24 @@ public void run() { // too many tries or fatal, mark as failed page.setStatus(Status.FATALLY_FAILED, "Fatal: " + fe.getMode()); } else { - // requeue at back - page.setStatus(Status.FAILED); + // If uris are already queued that are afterwards rendered "bad" + // by changing the badlisted extensions list, then they are cleaned + // out if they fail. 
+ boolean badListed = false; + String sURI = uri.toString(); + String lowerCaseURI = sURI.toLowerCase(Locale.US); + for (String ext : getRoot().getConfig().getBadlistedExtensions()) { + if (lowerCaseURI.endsWith(ext)) { + badListed = true; + } + } + + if (badListed) { + page.setStatus(Status.FATALLY_FAILED); + } else { + // requeue at back + page.setStatus(Status.FAILED); + } } db.endThreadTransaction(); dbTransactionEnded = true;