public class JsoupHTMLLinkStructureExtractor extends AbstractJsoupExtractor implements WandoraTool, BrowserPluginExtractor
Modifier and Type | Field and Description |
---|---|
private static java.lang.String | DOC_TYPE |
private static java.lang.String | LINK_TYPE |
private TopicMap | tm |
private Topic | wandoraClass |
CUSTOM_EXTRACTOR, DONE_FAILED, DONE_MANY, DONE_ONE, EXACTLY_GIVEN_URLS, FILE_EXTRACTOR, FILE_PATTERN, GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS, GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN, GIVEN_URLS_AND_LINKED_DOCUMENTS, GIVEN_URLS_AND_URL_BELOW, INFO_WAIT_WHILE_WORKING, LOG_TITLE, POINT_START_URL_TEXT, RAW_EXTRACTOR, SELECT_DIALOG_TITLE, STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE, URL_EXTRACTOR
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
Constructor and Description |
---|
JsoupHTMLLinkStructureExtractor() |
Modifier and Type | Method and Description |
---|---|
boolean | extractTopicsFrom(org.jsoup.nodes.Document d, java.lang.String u, TopicMap t) |
private void | parseLink(org.jsoup.nodes.Element link, Topic docTopic) |
_extractTopicsFrom, _extractTopicsFrom, _extractTopicsFrom, getContentTypes, getIcon, getLangTopic, getOrCreateTopic, getOrCreateTopic, getWandoraClassTopic, makeSubclassOf
acceptBrowserExtractRequest, addCrawlerUrl, browserExtractorConsumesPlainText, buildSI, buildSL, clearMasterSubject, createAssociation, createAssociation, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, croppedFilename, croppedFilename, croppedUrlString, croppedUrlString, doBrowserExtract, dropExtract, dropExtract, dropExtract, execute, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFromText, getBrowserExtractorName, getCrawlerMode, getDescription, getExtractorType, getForceContent, getForceFiles, getForceUrls, getGUIText, getGUIText, getInterruptsHandled, getMasterSubject, getName, getType, getWandora, handle, handleContent, handleCustomType, handleFiles, handleForcedContent, handleInterrupt, handleStringContent, handleUrls, initializeCustomType, instantDropHandle, makeSubclassOfWandoraClass, runInOwnThread, setData, setDisplayName, setForceContent, setForceFiles, setForceUrls, setMasterSubject, setMasterSubject, setTopicMap, setupCrawler, setWandora, takeNap, urlEncode, useTempTopicMap, useURLCrawler
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, configure, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isConfigurable, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
configure, execute, execute, execute, getContext, getDescription, getIcon, getName, getToolMenuItem, getType, hlog, initialize, isConfigurable, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
acceptBrowserExtractRequest, doBrowserExtract, getBrowserExtractorName
private TopicMap tm
private Topic wandoraClass
private static final java.lang.String LINK_TYPE
private static final java.lang.String DOC_TYPE
public boolean extractTopicsFrom(org.jsoup.nodes.Document d, java.lang.String u, TopicMap t) throws java.lang.Exception
extractTopicsFrom in class AbstractJsoupExtractor
Throws: java.lang.Exception
private void parseLink(org.jsoup.nodes.Element link, Topic docTopic) throws TopicMapException
Throws: TopicMapException
Copyright 2004-2015 Wandora Team