public abstract class AbstractExtractor extends AbstractWandoraTool implements WandoraTool, java.lang.Runnable, Handler, InterruptHandler, DropExtractor, BrowserPluginExtractor
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
Constructor and Description |
---|
AbstractExtractor()
Creates a new instance of AbstractExtractor
|
Modifier and Type | Method and Description |
---|---|
abstract boolean |
_extractTopicsFrom(java.io.File f,
TopicMap t) |
abstract boolean |
_extractTopicsFrom(java.lang.String str,
TopicMap t) |
abstract boolean |
_extractTopicsFrom(java.net.URL u,
TopicMap t) |
boolean |
acceptBrowserExtractRequest(BrowserExtractRequest request,
Wandora wandora) |
void |
addCrawlerUrl(java.net.URL url,
int depth) |
boolean |
browserExtractorConsumesPlainText() |
Locator |
buildSI(java.lang.String siend) |
Locator |
buildSL(java.io.File file) |
void |
clearMasterSubject() |
Association |
createAssociation(TopicMap topicMap,
Topic aType,
Topic[] players) |
Association |
createAssociation(TopicMap topicMap,
Topic aType,
Topic[] players,
Topic[] roles) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String baseString) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String siString,
java.lang.String baseString) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String siString,
java.lang.String baseNameString,
java.lang.String baseString) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String siString,
java.lang.String baseNameString,
java.lang.String baseString,
Topic type) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String siString,
java.lang.String baseNameString,
java.lang.String baseString,
Topic[] types) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String siString,
java.lang.String baseString,
Topic type) |
Topic |
createTopic(TopicMap topicMap,
java.lang.String baseString,
Topic type) |
protected java.lang.String |
croppedFilename(java.io.File file) |
protected java.lang.String |
croppedFilename(java.lang.String filename) |
protected java.lang.String |
croppedUrlString(java.lang.String urlString) |
protected java.lang.String |
croppedUrlString(java.net.URL url) |
java.lang.String |
doBrowserExtract(BrowserExtractRequest request,
Wandora wandora) |
void |
dropExtract(java.io.File[] files) |
void |
dropExtract(java.lang.String content) |
void |
dropExtract(java.lang.String[] urls) |
void |
execute(Wandora admin,
Context context)
Runs the tool.
|
void |
extractTopicsFrom(java.io.File file,
TopicMap topicMap) |
int |
extractTopicsFrom(java.lang.String fileName,
java.util.Collection visited,
java.util.regex.Pattern fileMask,
int depth,
int space) |
int |
extractTopicsFrom(java.lang.String fileName,
java.util.regex.Pattern fileMask,
int depth,
int space) |
void |
extractTopicsFrom(java.net.URL url,
TopicMap topicMap) |
void |
extractTopicsFromText(java.lang.String content,
TopicMap topicMap) |
java.lang.String |
getBrowserExtractorName() |
java.lang.String[] |
getContentTypes()
Returns an array of String containing the content-types this
ContentHandler can process. |
int |
getCrawlerMode() |
java.lang.String |
getDescription()
AdminToolManager views tool descriptions while the user browses available
tools and builds user-customizable GUI elements such as the Tools menu.
|
int |
getExtractorType() |
java.lang.Object |
getForceContent() |
java.io.File[] |
getForceFiles() |
java.lang.String[] |
getForceUrls() |
java.lang.String |
getGUIText(int textType) |
java.lang.String |
getGUIText(int textType,
java.lang.Object[] params) |
javax.swing.Icon |
getIcon()
All tools may have an identifying graphic icon used within tool GUI elements.
|
int[] |
getInterruptsHandled() |
java.lang.String |
getMasterSubject() |
java.lang.String |
getName()
The tool's name represents the tool in the UI unless the tool has been
explicitly given another GUI name.
|
WandoraToolType |
getType()
Tool type is used to categorize tools.
|
Wandora |
getWandora() |
void |
handle(CrawlerAccess crawler,
java.io.InputStream in,
int depth,
java.net.URL url)
Processes the given page.
|
void |
handleContent(java.lang.Object content,
TopicMap tm) |
void |
handleCustomType() |
void |
handleFiles(java.io.File[] files,
TopicMap tm) |
boolean |
handleForcedContent() |
void |
handleInterrupt(CrawlerAccess crawler,
int interrupt,
java.net.URL url) |
void |
handleStringContent(java.lang.String stringContent,
TopicMap tm) |
void |
handleUrls(java.lang.String[] urls,
TopicMap tm) |
void |
initializeCustomType() |
boolean |
instantDropHandle() |
void |
makeSubclassOfWandoraClass(Topic t,
TopicMap tm) |
boolean |
runInOwnThread()
Whether or not this tool should fork its own thread.
|
void |
setData(Topic t,
Topic type,
java.lang.String lang,
java.lang.String text) |
void |
setDisplayName(Topic t,
java.lang.String lang,
java.lang.String name) |
void |
setForceContent(java.lang.Object fcontent) |
void |
setForceFiles(java.io.File[] ffiles) |
void |
setForceUrls(java.lang.String[] furls) |
void |
setMasterSubject(java.lang.String subject) |
void |
setMasterSubject(Topic t) |
void |
setTopicMap(TopicMap tm) |
void |
setupCrawler(java.lang.String startUrl) |
private void |
setupCrawler(java.lang.String[] startUrls) |
void |
setWandora(Wandora app) |
protected void |
takeNap(long napTime) |
protected static java.lang.String |
urlEncode(java.lang.String str) |
boolean |
useTempTopicMap() |
boolean |
useURLCrawler() |
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, configure, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isConfigurable, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
configure, execute, execute, getContext, getToolMenuItem, hlog, initialize, isConfigurable, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
public static final java.lang.String STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE
public static final int CUSTOM_EXTRACTOR
public static final int RAW_EXTRACTOR
public static final int FILE_EXTRACTOR
public static final int URL_EXTRACTOR
public static final int EXACTLY_GIVEN_URLS
public static final int GIVEN_URLS_AND_LINKED_DOCUMENTS
public static final int GIVEN_URLS_AND_URL_BELOW
public static final int GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN
public static final int GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS
public static final int SELECT_DIALOG_TITLE
public static final int POINT_START_URL_TEXT
public static final int INFO_WAIT_WHILE_WORKING
public static final int FILE_PATTERN
public static final int DONE_FAILED
public static final int DONE_ONE
public static final int DONE_MANY
public static final int LOG_TITLE
private Wandora wandora
private TopicMap topicMap
private java.lang.String[] forceUrls
private java.io.File[] forceFiles
private java.lang.Object forceContent
private WebCrawler crawler
private int maximumCrawlCounter
private int extractionCounter
private int foundCounter
private int browseCounter
private long errorNapTime
private AbstractExtractorDialog extractorSourceDialog
private java.lang.String masterSubject
private final java.lang.String[] contentTypes
public AbstractExtractor()
public WandoraToolType getType()
AbstractWandoraTool
Tool type is used to categorize tools. Tool type has no real effect today; it is merely an informative property of a tool.
getType
in interface WandoraTool
getType
in class AbstractWandoraTool
public java.lang.String getName()
AbstractWandoraTool
getName
in interface WandoraTool
getName
in class AbstractWandoraTool
public java.lang.String getDescription()
AbstractWandoraTool
getDescription
in interface WandoraTool
getDescription
in class AbstractWandoraTool
public javax.swing.Icon getIcon()
AbstractWandoraTool
getIcon should return the Icon object of the tool.
getIcon
in interface WandoraTool
getIcon
in class AbstractWandoraTool
public boolean runInOwnThread()
AbstractWandoraTool
runInOwnThread
in class AbstractWandoraTool
public boolean useTempTopicMap()
public boolean useURLCrawler()
public boolean instantDropHandle()
public void dropExtract(java.io.File[] files) throws TopicMapException
dropExtract
in interface DropExtractor
TopicMapException
public void dropExtract(java.lang.String[] urls) throws TopicMapException
dropExtract
in interface DropExtractor
TopicMapException
public void dropExtract(java.lang.String content) throws TopicMapException
dropExtract
in interface DropExtractor
TopicMapException
public java.lang.String getGUIText(int textType, java.lang.Object[] params)
public java.lang.String getGUIText(int textType)
public void setForceFiles(java.io.File[] ffiles)
public void setForceUrls(java.lang.String[] furls)
public void setForceContent(java.lang.Object fcontent)
public java.io.File[] getForceFiles()
public java.lang.String[] getForceUrls()
public java.lang.Object getForceContent()
public int getExtractorType()
public void initializeCustomType()
public void handleCustomType()
public void execute(Wandora admin, Context context)
WandoraTool
execute
in interface WandoraTool
public boolean handleForcedContent()
public void handleFiles(java.io.File[] files, TopicMap tm)
public void handleUrls(java.lang.String[] urls, TopicMap tm)
public void handleContent(java.lang.Object content, TopicMap tm)
public void handleStringContent(java.lang.String stringContent, TopicMap tm)
public void setTopicMap(TopicMap tm)
private void setupCrawler(java.lang.String[] startUrls)
public void setupCrawler(java.lang.String startUrl)
public void addCrawlerUrl(java.net.URL url, int depth)
public int getCrawlerMode()
protected void takeNap(long napTime)
public int extractTopicsFrom(java.lang.String fileName, java.util.regex.Pattern fileMask, int depth, int space)
public int extractTopicsFrom(java.lang.String fileName, java.util.Collection visited, java.util.regex.Pattern fileMask, int depth, int space)
public void extractTopicsFrom(java.net.URL url, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
public void extractTopicsFromText(java.lang.String content, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
public void extractTopicsFrom(java.io.File file, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
public Locator buildSL(java.io.File file)
public abstract boolean _extractTopicsFrom(java.io.File f, TopicMap t) throws java.lang.Exception
java.lang.Exception
public abstract boolean _extractTopicsFrom(java.net.URL u, TopicMap t) throws java.lang.Exception
java.lang.Exception
public abstract boolean _extractTopicsFrom(java.lang.String str, TopicMap t) throws java.lang.Exception
java.lang.Exception
protected static java.lang.String urlEncode(java.lang.String str)
public Locator buildSI(java.lang.String siend)
public Topic createTopic(TopicMap topicMap, java.lang.String baseString) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String siString, java.lang.String baseString) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String siString, java.lang.String baseString, Topic type) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String baseString, Topic type) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String siString, java.lang.String baseNameString, java.lang.String baseString) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String siString, java.lang.String baseNameString, java.lang.String baseString, Topic type) throws TopicMapException
TopicMapException
public Topic createTopic(TopicMap topicMap, java.lang.String siString, java.lang.String baseNameString, java.lang.String baseString, Topic[] types) throws TopicMapException
TopicMapException
public Association createAssociation(TopicMap topicMap, Topic aType, Topic[] players) throws TopicMapException
TopicMapException
public Association createAssociation(TopicMap topicMap, Topic aType, Topic[] players, Topic[] roles) throws TopicMapException
TopicMapException
public void setDisplayName(Topic t, java.lang.String lang, java.lang.String name) throws TopicMapException
TopicMapException
public void setData(Topic t, Topic type, java.lang.String lang, java.lang.String text) throws TopicMapException
TopicMapException
public void makeSubclassOfWandoraClass(Topic t, TopicMap tm) throws TopicMapException
TopicMapException
protected java.lang.String croppedFilename(java.lang.String filename)
protected java.lang.String croppedFilename(java.io.File file)
protected java.lang.String croppedUrlString(java.net.URL url)
protected java.lang.String croppedUrlString(java.lang.String urlString)
public void handle(CrawlerAccess crawler, java.io.InputStream in, int depth, java.net.URL url)
Handler
The InputStream contains the data of an object that is
of the content-type this content handler accepts. May use the given
CrawlerAccess object to add further pages to the queue of the
WebCrawler that asked to process the page.
handle
in interface Handler
crawler - The callback object for the handler. Any objects built from
the content of the page can be sent to this.
in - The InputStream of the page.
depth - The remaining depth. When reporting another page to
the queue, the depth of that page should be set to this depth-1.
url - The URL of the page.
public java.lang.String[] getContentTypes()
Handler
Returns an array of String containing the content-types this ContentHandler can process.
getContentTypes
in interface Handler
public void handleInterrupt(CrawlerAccess crawler, int interrupt, java.net.URL url)
handleInterrupt
in interface InterruptHandler
public int[] getInterruptsHandled()
getInterruptsHandled
in interface InterruptHandler
public java.lang.String doBrowserExtract(BrowserExtractRequest request, Wandora wandora) throws TopicMapException
doBrowserExtract
in interface BrowserPluginExtractor
TopicMapException
public boolean acceptBrowserExtractRequest(BrowserExtractRequest request, Wandora wandora) throws TopicMapException
acceptBrowserExtractRequest
in interface BrowserPluginExtractor
TopicMapException
public java.lang.String getBrowserExtractorName()
getBrowserExtractorName
in interface BrowserPluginExtractor
public boolean browserExtractorConsumesPlainText()
public java.lang.String getMasterSubject()
public void setMasterSubject(Topic t)
public void setMasterSubject(java.lang.String subject)
public void clearMasterSubject()
public void setWandora(Wandora app)
public Wandora getWandora()
Copyright 2004-2015 Wandora Team