public class MediaWikiAPIPageExtractor extends AbstractMediaWikiAPIExtractor
Modifier and Type | Field and Description |
---|---|
private java.lang.String |
baseURL |
private java.lang.String[] |
contentTypes |
private boolean |
crawlClasses |
private WandoraToolLogger |
logger |
private int |
nExtracted |
private int |
progress |
private java.lang.String[] |
qType |
private java.lang.String |
queryURL |
CONTENT_TYPE_SI, LANG_SI, PAGE_SI, SI_ROOT
CUSTOM_EXTRACTOR, DONE_FAILED, DONE_MANY, DONE_ONE, EXACTLY_GIVEN_URLS, FILE_EXTRACTOR, FILE_PATTERN, GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS, GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN, GIVEN_URLS_AND_LINKED_DOCUMENTS, GIVEN_URLS_AND_URL_BELOW, INFO_WAIT_WHILE_WORKING, LOG_TITLE, POINT_START_URL_TEXT, RAW_EXTRACTOR, SELECT_DIALOG_TITLE, STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE, URL_EXTRACTOR
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
Constructor and Description |
---|
MediaWikiAPIPageExtractor(java.lang.String baseURL,
java.lang.String[] qType,
boolean crawl) |
Modifier and Type | Method and Description |
---|---|
boolean |
_extractTopicsFrom(java.io.File f,
TopicMap t) |
boolean |
_extractTopicsFrom(java.lang.String str,
TopicMap t) |
boolean |
_extractTopicsFrom(java.net.URL u,
TopicMap t) |
private void |
continueExtraction(org.wandora.dep.json.JSONObject contObject,
TopicMap t) |
private boolean |
extractTopicsFromString(java.lang.String str,
TopicMap t) |
private boolean |
extractTopicsFromURL(java.net.URL u,
TopicMap t) |
private java.lang.String |
getArticleBody(java.lang.String title) |
private java.util.List<java.lang.String> |
getArticleClasses(java.lang.String title) |
private java.util.HashMap<java.lang.String,java.lang.String> |
getArticleInfo(java.lang.String title) |
protected java.lang.String |
getBaseUrl() |
java.lang.String[] |
getContentTypes()
Returns an array of String containing the content-types this
ContentHandler can process. |
protected java.lang.String |
getQueryUrl() |
protected void |
incrementExtractions() |
private org.wandora.dep.json.JSONObject |
parse(org.wandora.dep.json.JSONObject body,
TopicMap tm) |
private void |
parsePage(org.wandora.dep.json.JSONObject page,
TopicMap tm) |
private void |
parsePage(java.lang.String title,
TopicMap tm) |
private void |
printError(org.wandora.dep.json.JSONObject body) |
private void |
printWarnings(org.wandora.dep.json.JSONObject body) |
boolean |
runInOwnThread()
Whether or not this tool should fork its own thread.
|
protected void |
setQueryUrl(java.lang.String u) |
boolean |
useURLCrawler() |
getContentTypeTopic, getLangTopic, getLangTopic, getMediaWikiClass, getOrCreateTopic, getOrCreateTopic, getWandoraClassTopic, makeSubclassOf
acceptBrowserExtractRequest, addCrawlerUrl, browserExtractorConsumesPlainText, buildSI, buildSL, clearMasterSubject, createAssociation, createAssociation, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, croppedFilename, croppedFilename, croppedUrlString, croppedUrlString, doBrowserExtract, dropExtract, dropExtract, dropExtract, execute, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFromText, getBrowserExtractorName, getCrawlerMode, getDescription, getExtractorType, getForceContent, getForceFiles, getForceUrls, getGUIText, getGUIText, getIcon, getInterruptsHandled, getMasterSubject, getName, getType, getWandora, handle, handleContent, handleCustomType, handleFiles, handleForcedContent, handleInterrupt, handleStringContent, handleUrls, initializeCustomType, instantDropHandle, makeSubclassOfWandoraClass, setData, setDisplayName, setForceContent, setForceFiles, setForceUrls, setMasterSubject, setMasterSubject, setTopicMap, setupCrawler, setWandora, takeNap, urlEncode, useTempTopicMap
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, configure, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isConfigurable, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
configure, execute, execute, getContext, getToolMenuItem, hlog, initialize, isConfigurable, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
private int nExtracted
private java.lang.String baseURL
private java.lang.String queryURL
private boolean crawlClasses
private java.lang.String[] qType
private int progress
private WandoraToolLogger logger
private final java.lang.String[] contentTypes
MediaWikiAPIPageExtractor(java.lang.String baseURL, java.lang.String[] qType, boolean crawl)
public boolean useURLCrawler()
useURLCrawler
in class AbstractExtractor
public boolean runInOwnThread()
Description copied from class: AbstractWandoraTool
Whether or not this tool should fork its own thread.
runInOwnThread
in class AbstractExtractor
protected void setQueryUrl(java.lang.String u)
protected java.lang.String getBaseUrl()
protected java.lang.String getQueryUrl()
protected void incrementExtractions()
public java.lang.String[] getContentTypes()
Description copied from interface: Handler
Returns an array of String containing the content-types this ContentHandler can process.
getContentTypes
in interface Handler
getContentTypes
in class AbstractExtractor
public boolean _extractTopicsFrom(java.io.File f, TopicMap t) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.net.URL u, TopicMap t) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.lang.String str, TopicMap t) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
private boolean extractTopicsFromString(java.lang.String str, TopicMap t)
private boolean extractTopicsFromURL(java.net.URL u, TopicMap t)
private void continueExtraction(org.wandora.dep.json.JSONObject contObject, TopicMap t) throws java.lang.Exception
java.lang.Exception
private org.wandora.dep.json.JSONObject parse(org.wandora.dep.json.JSONObject body, TopicMap tm) throws org.wandora.dep.json.JSONException, TopicMapException, java.io.IOException
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private void parsePage(org.wandora.dep.json.JSONObject page, TopicMap tm) throws org.wandora.dep.json.JSONException, TopicMapException, java.io.IOException
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private void parsePage(java.lang.String title, TopicMap tm) throws org.wandora.dep.json.JSONException, TopicMapException, java.io.IOException
org.wandora.dep.json.JSONException
TopicMapException
java.io.IOException
private java.lang.String getArticleBody(java.lang.String title) throws java.io.IOException
java.io.IOException
private java.util.HashMap<java.lang.String,java.lang.String> getArticleInfo(java.lang.String title) throws java.io.IOException
java.io.IOException
private java.util.List<java.lang.String> getArticleClasses(java.lang.String title) throws java.io.IOException
java.io.IOException
private void printError(org.wandora.dep.json.JSONObject body) throws org.wandora.dep.json.JSONException
org.wandora.dep.json.JSONException
private void printWarnings(org.wandora.dep.json.JSONObject body) throws org.wandora.dep.json.JSONException
org.wandora.dep.json.JSONException
Copyright 2004-2015 Wandora Team