public class MobyThesaurusExtractor extends AbstractExtractor implements WandoraTool
Tool reads a Moby thesaurus file and converts it to a topic map. A Moby thesaurus file is a simple text file where each line defines a single word and its related words. For example:
word1 relatedWord1 relatedWord2 relatedWord3 relatedWord4 ...
word2 relatedWord1 relatedWord2 relatedWord3 relatedWord4 ...
This extractor creates a topic for each word (including related words) and a binary association for each word-relatedWord pair. If a word has four related words, the extractor creates four associations. Note that a word may also be a related word for some other word, which increases the overall number of associations that word eventually gets.
Moby thesaurus is public domain and can be acquired from http://www.gutenberg.org/etext/3202
As the Moby thesaurus contains hundreds of thousands of words, Wandora requires at least 2 GB of memory to extract the complete thesaurus.
Modifier and Type | Field and Description |
---|---|
boolean |
ANTISYMMETRIC_ASSOCIATIONS |
java.lang.String |
locatorPrefix |
boolean |
REMOVE_RARE_WORDS |
CUSTOM_EXTRACTOR, DONE_FAILED, DONE_MANY, DONE_ONE, EXACTLY_GIVEN_URLS, FILE_EXTRACTOR, FILE_PATTERN, GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS, GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN, GIVEN_URLS_AND_LINKED_DOCUMENTS, GIVEN_URLS_AND_URL_BELOW, INFO_WAIT_WHILE_WORKING, LOG_TITLE, POINT_START_URL_TEXT, RAW_EXTRACTOR, SELECT_DIALOG_TITLE, STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE, URL_EXTRACTOR
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
Constructor and Description |
---|
MobyThesaurusExtractor()
Creates a new instance of MobyThesaurusExtractor
|
Modifier and Type | Method and Description |
---|---|
boolean |
_extractTopicsFrom(java.io.BufferedReader breader,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.io.File thesaurusFile,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.lang.String str,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.net.URL url,
TopicMap topicMap) |
boolean |
associationExists(Topic t1,
Topic t2,
Topic at) |
boolean |
browserExtractorConsumesPlainText() |
java.lang.String |
getDescription()
AdminToolManager views tool descriptions while the user browses available
tools and builds user-customizable GUI elements such as the Tools menu.
|
java.lang.String |
getGUIText(int textType) |
java.lang.String |
getName()
The tool's name represents the tool in the UI unless the tool has explicitly
been given another GUI name.
|
Topic |
getOrCreateTopic(TopicMap topicmap,
Locator si,
java.lang.String baseName,
java.lang.String displayName) |
Topic |
getOrCreateTopic(TopicMap topicmap,
Locator si,
java.lang.String baseName,
java.lang.String displayName,
Topic typeTopic) |
Topic |
getOrCreateTopic(TopicMap topicmap,
java.lang.String si,
java.lang.String baseName,
java.lang.String displayName) |
Locator |
makeSI(java.lang.String str) |
boolean |
useTempTopicMap() |
acceptBrowserExtractRequest, addCrawlerUrl, buildSI, buildSL, clearMasterSubject, createAssociation, createAssociation, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, croppedFilename, croppedFilename, croppedUrlString, croppedUrlString, doBrowserExtract, dropExtract, dropExtract, dropExtract, execute, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFromText, getBrowserExtractorName, getContentTypes, getCrawlerMode, getExtractorType, getForceContent, getForceFiles, getForceUrls, getGUIText, getIcon, getInterruptsHandled, getMasterSubject, getType, getWandora, handle, handleContent, handleCustomType, handleFiles, handleForcedContent, handleInterrupt, handleStringContent, handleUrls, initializeCustomType, instantDropHandle, makeSubclassOfWandoraClass, runInOwnThread, setData, setDisplayName, setForceContent, setForceFiles, setForceUrls, setMasterSubject, setMasterSubject, setTopicMap, setupCrawler, setWandora, takeNap, urlEncode, useURLCrawler
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, configure, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isConfigurable, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
configure, execute, execute, execute, getContext, getIcon, getToolMenuItem, getType, hlog, initialize, isConfigurable, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
public java.lang.String locatorPrefix
public boolean ANTISYMMETRIC_ASSOCIATIONS
public boolean REMOVE_RARE_WORDS
public MobyThesaurusExtractor()
public java.lang.String getName()
AbstractWandoraTool
getName
in interface WandoraTool
getName
in class AbstractExtractor
public java.lang.String getDescription()
AbstractWandoraTool
getDescription
in interface WandoraTool
getDescription
in class AbstractExtractor
public java.lang.String getGUIText(int textType)
getGUIText
in class AbstractExtractor
public boolean browserExtractorConsumesPlainText()
browserExtractorConsumesPlainText
in class AbstractExtractor
public boolean _extractTopicsFrom(java.net.URL url, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.io.File thesaurusFile, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.lang.String str, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.io.BufferedReader breader, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
public Topic getOrCreateTopic(TopicMap topicmap, java.lang.String si, java.lang.String baseName, java.lang.String displayName)
public Topic getOrCreateTopic(TopicMap topicmap, Locator si, java.lang.String baseName, java.lang.String displayName)
public Topic getOrCreateTopic(TopicMap topicmap, Locator si, java.lang.String baseName, java.lang.String displayName, Topic typeTopic)
public Locator makeSI(java.lang.String str)
public boolean useTempTopicMap()
useTempTopicMap
in class AbstractExtractor
Copyright 2004-2015 Wandora Team