public class MarcXMLExtractor extends AbstractExtractor
Modifier and Type | Class and Description |
---|---|
private class |
MarcXMLExtractor.MarcXMLParser |
Modifier and Type | Field and Description |
---|---|
private java.lang.String[] |
authorCodes |
static java.lang.String |
BASENAME_PATTERN |
private java.util.ArrayList<java.lang.String> |
basenamePatterns |
private java.lang.String[] |
contentTypes |
static boolean |
CONVERT_LEADERS |
protected static java.lang.String |
DATA_SI |
private static java.lang.String |
defaultEncoding |
private static java.lang.String |
defaultLang |
static java.lang.String |
EXCLUDE_FIELDS |
static java.lang.String |
EXCLUDE_SUBFIELDS |
private java.util.HashMap |
excludeFields |
private java.util.HashMap |
excludeSubfields |
protected static java.lang.String |
FIELD_SI |
protected static java.lang.String |
FIELD_SI_TEMPLATE |
static java.lang.String |
INCLUDE_FIELDS |
static boolean |
INCLUDE_INDX_IN_ASSOCIATIONS |
static java.lang.String |
INCLUDE_SUBFIELDS |
private java.util.HashMap |
includeFields |
private java.util.HashMap |
includeSubfields |
protected static java.lang.String |
IND_SI |
protected static java.lang.String |
LEADER_SI |
protected static java.lang.String |
MARC_SI |
protected static java.lang.String |
RECORD_SI |
static java.lang.String |
RECORD_SI_PATTERN |
private java.util.ArrayList<java.lang.String> |
recordSIPatterns |
static boolean |
SOLVE_FIELD_NAMES |
static boolean |
SOLVE_SUBFIELD_NAMES |
protected static java.lang.String |
SUBFIELDCODE_SI |
private java.lang.String[] |
titleCodes |
static boolean |
TRIM_DATAS |
CUSTOM_EXTRACTOR, DONE_FAILED, DONE_MANY, DONE_ONE, EXACTLY_GIVEN_URLS, FILE_EXTRACTOR, FILE_PATTERN, GIVEN_URLS_AND_ALL_CRAWLED_DOCUMENTS, GIVEN_URLS_AND_CRAWLED_DOCUMENTS_IN_URL_DOMAIN, GIVEN_URLS_AND_LINKED_DOCUMENTS, GIVEN_URLS_AND_URL_BELOW, INFO_WAIT_WHILE_WORKING, LOG_TITLE, POINT_START_URL_TEXT, RAW_EXTRACTOR, SELECT_DIALOG_TITLE, STRING_EXTRACTOR_NOT_SUPPORTED_MESSAGE, URL_EXTRACTOR
CLOSE, EXECUTE, INVISIBLE, VISIBLE, WAIT
RETURN_ERROR, RETURN_INFO
Constructor and Description |
---|
MarcXMLExtractor()
Creates a new instance of MarcXMLExtractor
|
Modifier and Type | Method and Description |
---|---|
boolean |
_extractTopicsFrom(java.io.File file,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(org.xml.sax.InputSource in,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.io.InputStream in,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.lang.String str,
TopicMap topicMap) |
boolean |
_extractTopicsFrom(java.net.URL url,
TopicMap topicMap) |
void |
configure(Wandora admin,
Options options,
java.lang.String prefix)
If the tool is configurable, shows a user interface to configure the tool.
|
java.lang.String[] |
getContentTypes()
Returns an array of String containing the content-types this
ContentHandler can process. |
Topic |
getDataType(TopicMap tm) |
java.lang.String |
getDescription()
AdminToolManager shows tool descriptions while the user browses available
tools and builds user-customizable GUI elements such as the Tools menu.
|
Topic |
getFieldTopic(java.lang.String field,
TopicMap tm) |
Topic |
getFieldType(TopicMap tm) |
javax.swing.Icon |
getIcon()
All tools may have an identifying graphic icon used within tool GUI elements.
|
Topic |
getInd1Topic(java.lang.String ind1,
java.lang.String tag,
TopicMap tm) |
Topic |
getInd1Type(java.lang.String tag,
TopicMap tm) |
Topic |
getInd2Topic(java.lang.String ind2,
java.lang.String tag,
TopicMap tm) |
Topic |
getInd2Type(java.lang.String tag,
TopicMap tm) |
java.lang.String |
getIndicatorName(java.lang.String field,
java.lang.String indicatorId,
java.lang.String value) |
java.lang.String |
getIndicatorValueName(java.lang.String field,
java.lang.String indicatorId,
java.lang.String value) |
Topic |
getIndType(TopicMap tm) |
Topic |
getLeaderType(TopicMap tm) |
Topic |
getMARCClass(TopicMap tm) |
java.lang.String |
getName()
The tool's name represents the tool in the UI unless the tool has been
explicitly given another GUI name.
|
protected Topic |
getOrCreateTopic(TopicMap tm,
java.lang.String si) |
protected Topic |
getOrCreateTopic(TopicMap tm,
java.lang.String si,
java.lang.String bn) |
protected Topic |
getOrCreateTopic(TopicMap tm,
java.lang.String si,
java.lang.String bn,
Topic type) |
Topic |
getRecordType(TopicMap tm) |
Topic |
getSubFieldCodeTopic(java.lang.String subfied,
int counter,
java.lang.String field,
java.lang.String ind1Modifier,
java.lang.String ind2Modifier,
TopicMap tm) |
Topic |
getSubFieldCodeTopic(java.lang.String subfied,
java.lang.String field,
java.lang.String ind1Modifier,
java.lang.String ind2Modifier,
TopicMap tm) |
Topic |
getSubFieldCodeType(TopicMap tm) |
Topic |
getSubFieldDataTopic(java.lang.String data,
java.lang.String tagModifier,
java.lang.String ind1Modifier,
java.lang.String ind2Modifier,
TopicMap tm) |
Topic |
getTopic(TopicMap tm,
java.lang.String str,
java.lang.String SIBase,
Topic type) |
Topic |
getWandoraClass(TopicMap tm) |
boolean |
isConfigurable()
Whether this tool is configurable.
|
protected java.lang.String |
makeFieldSI(java.lang.String field) |
protected java.lang.String |
makeSI(java.lang.String base,
java.lang.String endPoint) |
protected void |
makeSubclassOf(TopicMap tm,
Topic t,
Topic superclass) |
void |
parseBasenamePatterns(java.lang.String patterns) |
private java.util.HashMap |
parseFieldCodes(java.lang.String str) |
void |
parseSIPatterns(java.lang.String patterns) |
java.util.HashMap |
parseSubfieldCodes(java.lang.String str) |
void |
processControlField(java.lang.String field,
java.lang.String data,
Topic record,
Topic type,
TopicMap tm) |
protected void |
topicalize(java.lang.String leader,
java.util.ArrayList<MarcField> datafields,
java.util.HashMap<java.lang.String,java.lang.String> controlfields,
java.util.ArrayList<java.lang.String> subjectIdentifiers,
java.util.ArrayList<java.lang.String> basenames,
TopicMap tm) |
boolean |
useURLCrawler() |
acceptBrowserExtractRequest, addCrawlerUrl, browserExtractorConsumesPlainText, buildSI, buildSL, clearMasterSubject, createAssociation, createAssociation, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, createTopic, croppedFilename, croppedFilename, croppedUrlString, croppedUrlString, doBrowserExtract, dropExtract, dropExtract, dropExtract, execute, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFrom, extractTopicsFromText, getBrowserExtractorName, getCrawlerMode, getExtractorType, getForceContent, getForceFiles, getForceUrls, getGUIText, getGUIText, getInterruptsHandled, getMasterSubject, getType, getWandora, handle, handleContent, handleCustomType, handleFiles, handleForcedContent, handleInterrupt, handleStringContent, handleUrls, initializeCustomType, instantDropHandle, makeSubclassOfWandoraClass, runInOwnThread, setData, setDisplayName, setForceContent, setForceFiles, setForceUrls, setMasterSubject, setMasterSubject, setTopicMap, setupCrawler, setWandora, takeNap, urlEncode, useTempTopicMap
addUndoMarker, addUndoMarker, allowMultipleInvocations, clearAllThreads, clearThreads, clearThreads, clearToolLock, clearToolLock, clearToolLocks, execute, execute, forceStop, forceStop, getContext, getCurrentLogger, getDefaultLogger, getHistory, getLastLogger, getState, getThreads, getThreads, getToolMenuItem, getToolMenuItem, getTopicName, hlog, initialize, interruptAllThreads, interruptThreads, interruptThreads, isRunning, isRunning, lockLog, log, log, log, log, requiresRefresh, run, setContext, setDefaultLogger, setLogTitle, setProgress, setProgressMax, setState, setToolLogger, singleLog, singleLog, singleLog, solveContextTopicMap, solveNameForTopicMap, writeOptions
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
execute, execute, getContext, getToolMenuItem, hlog, initialize, isRunning, log, log, log, log, requiresRefresh, setContext, setToolLogger, writeOptions
forceStop, getHistory, getState, lockLog, setLogTitle, setProgress, setProgressMax, setState
public static boolean TRIM_DATAS
public static boolean INCLUDE_INDX_IN_ASSOCIATIONS
public static boolean SOLVE_FIELD_NAMES
public static boolean SOLVE_SUBFIELD_NAMES
public static boolean CONVERT_LEADERS
public static java.lang.String EXCLUDE_FIELDS
public static java.lang.String INCLUDE_FIELDS
public static java.lang.String EXCLUDE_SUBFIELDS
public static java.lang.String INCLUDE_SUBFIELDS
public static java.lang.String RECORD_SI_PATTERN
public static java.lang.String BASENAME_PATTERN
protected static java.lang.String MARC_SI
protected static java.lang.String IND_SI
protected static java.lang.String SUBFIELDCODE_SI
protected static java.lang.String LEADER_SI
protected static java.lang.String FIELD_SI
protected static java.lang.String FIELD_SI_TEMPLATE
protected static java.lang.String DATA_SI
protected static java.lang.String RECORD_SI
private static java.lang.String defaultEncoding
private static java.lang.String defaultLang
private java.util.HashMap excludeFields
private java.util.HashMap includeFields
private java.util.HashMap excludeSubfields
private java.util.HashMap includeSubfields
private java.util.ArrayList<java.lang.String> recordSIPatterns
private java.util.ArrayList<java.lang.String> basenamePatterns
private final java.lang.String[] contentTypes
private java.lang.String[] titleCodes
private java.lang.String[] authorCodes
public MarcXMLExtractor()
public java.lang.String getName()
AbstractWandoraTool
getName
in interface WandoraTool
getName
in class AbstractExtractor
public java.lang.String getDescription()
AbstractWandoraTool
getDescription
in interface WandoraTool
getDescription
in class AbstractExtractor
public javax.swing.Icon getIcon()
AbstractWandoraTool
getIcon should return the Icon object of the tool.
getIcon
in interface WandoraTool
getIcon
in class AbstractExtractor
public java.lang.String[] getContentTypes()
Handler
Returns an array of String containing the content-types this ContentHandler can process.
getContentTypes
in interface Handler
getContentTypes
in class AbstractExtractor
public boolean useURLCrawler()
useURLCrawler
in class AbstractExtractor
public boolean isConfigurable()
AbstractWandoraTool
isConfigurable
in interface WandoraTool
isConfigurable
in class AbstractWandoraTool
public void configure(Wandora admin, Options options, java.lang.String prefix) throws TopicMapException
AbstractWandoraTool
configure
in interface WandoraTool
configure
in class AbstractWandoraTool
TopicMapException
private java.util.HashMap parseFieldCodes(java.lang.String str)
public java.util.HashMap parseSubfieldCodes(java.lang.String str)
public void parseSIPatterns(java.lang.String patterns)
public void parseBasenamePatterns(java.lang.String patterns)
public boolean _extractTopicsFrom(java.net.URL url, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.io.File file, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.lang.String str, TopicMap topicMap) throws java.lang.Exception
_extractTopicsFrom
in class AbstractExtractor
java.lang.Exception
public boolean _extractTopicsFrom(java.io.InputStream in, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
public boolean _extractTopicsFrom(org.xml.sax.InputSource in, TopicMap topicMap) throws java.lang.Exception
java.lang.Exception
protected void topicalize(java.lang.String leader, java.util.ArrayList<MarcField> datafields, java.util.HashMap<java.lang.String,java.lang.String> controlfields, java.util.ArrayList<java.lang.String> subjectIdentifiers, java.util.ArrayList<java.lang.String> basenames, TopicMap tm)
public void processControlField(java.lang.String field, java.lang.String data, Topic record, Topic type, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getLeaderType(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getFieldTopic(java.lang.String field, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getFieldType(TopicMap tm) throws TopicMapException
TopicMapException
protected java.lang.String makeFieldSI(java.lang.String field)
public Topic getSubFieldCodeType(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getSubFieldCodeTopic(java.lang.String subfied, int counter, java.lang.String field, java.lang.String ind1Modifier, java.lang.String ind2Modifier, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getSubFieldCodeTopic(java.lang.String subfied, java.lang.String field, java.lang.String ind1Modifier, java.lang.String ind2Modifier, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getSubFieldDataTopic(java.lang.String data, java.lang.String tagModifier, java.lang.String ind1Modifier, java.lang.String ind2Modifier, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getDataType(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getInd1Topic(java.lang.String ind1, java.lang.String tag, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getInd1Type(java.lang.String tag, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getIndType(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getInd2Topic(java.lang.String ind2, java.lang.String tag, TopicMap tm) throws TopicMapException
TopicMapException
public Topic getInd2Type(java.lang.String tag, TopicMap tm) throws TopicMapException
TopicMapException
public java.lang.String getIndicatorName(java.lang.String field, java.lang.String indicatorId, java.lang.String value)
public java.lang.String getIndicatorValueName(java.lang.String field, java.lang.String indicatorId, java.lang.String value)
public Topic getRecordType(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getMARCClass(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getWandoraClass(TopicMap tm) throws TopicMapException
TopicMapException
public Topic getTopic(TopicMap tm, java.lang.String str, java.lang.String SIBase, Topic type) throws TopicMapException
TopicMapException
protected Topic getOrCreateTopic(TopicMap tm, java.lang.String si) throws TopicMapException
TopicMapException
protected Topic getOrCreateTopic(TopicMap tm, java.lang.String si, java.lang.String bn) throws TopicMapException
TopicMapException
protected Topic getOrCreateTopic(TopicMap tm, java.lang.String si, java.lang.String bn, Topic type) throws TopicMapException
TopicMapException
protected void makeSubclassOf(TopicMap tm, Topic t, Topic superclass) throws TopicMapException
TopicMapException
protected java.lang.String makeSI(java.lang.String base, java.lang.String endPoint)
Copyright 2004-2015 Wandora Team