NIH | National Cancer Institute | NCI Wiki  

Java Code
/*
 * Copyright: (c) 2004-2009 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 * 		http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.LexGrid.LexBIG.example;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

import org.LexGrid.LexBIG.DataModel.Collections.ResolvedConceptReferenceList;
import org.LexGrid.LexBIG.DataModel.Collections.SortOptionList;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeSummary;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeVersionOrTag;
import org.LexGrid.LexBIG.DataModel.Core.ResolvedConceptReference;
import org.LexGrid.LexBIG.Exceptions.LBException;
import org.LexGrid.LexBIG.Impl.LexBIGServiceImpl;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet;
import org.LexGrid.LexBIG.LexBIGService.LexBIGService;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.PropertyType;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.SearchDesignationOption;
import org.LexGrid.LexBIG.Utility.Constructors;
import org.LexGrid.LexBIG.Utility.LBConstants.MatchAlgorithms;
import org.LexGrid.concepts.Entity;

/**
 * Example attempting to approximate some characteristics of the Metaphrase
 * search algorithm. However, full Metaphrase compatibility is not anticipated.
 */
public class MetaMatch {

    // Identify common stop words (words to be ignored in most match
    // circumstances).
    // This list extends from the LVG stop words ...
    static final List<String> STOP_WORDS = Arrays.asList(new String[] { "a", "an", "and", "by", "for", "of", "on",
            "in", "nos", "the", "to", "with" });

    /**
     * Default constructor.
     */
    public MetaMatch() {
        super();
    }

    /**
     * Entry point for processing.
     * 
     * @param args
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Example: MetaMatch \"test string\"");
            return;
        }
        ;

        try {
            String s = args[0];
            new MetaMatch().run(s);
        } catch (Exception e) {
            Util.displayAndLogError("REQUEST FAILED !!!", e);
        }
    }

    public void run(String s) throws LBException {
        CodingSchemeSummary css = Util.promptForCodeSystem();
        if (css != null) {
            LexBIGService lbSvc = LexBIGServiceImpl.defaultInstance();
            String scheme = css.getCodingSchemeURI();
            CodingSchemeVersionOrTag csvt = new CodingSchemeVersionOrTag();
            csvt.setVersion(css.getRepresentsVersion());

            matchSynonyms(s, lbSvc, scheme, csvt);
            matchSpell(s, lbSvc, scheme, csvt);
            matchTermCompletion(s, lbSvc, scheme, csvt);
            matchWordCompletion(s, lbSvc, scheme, csvt);
            matchSubquery(s, lbSvc, scheme, csvt);
        }
    }

    /**
     * Display concepts and related text strings matching the given string.
     * 
     * @param s
     *            The test string.
     * @param lbSvc
     * @param scheme
     * @param csvt
     * @throws LBException
     */
    protected void matchSynonyms(String s, LexBIGService lbSvc, String scheme, CodingSchemeVersionOrTag csvt)
            throws LBException {
        Util.displayMessage("\n*** Matching synonyms (incorporates partial normalization/stemming).");

        // An attempt is made to use a partial normalized form on search by
        // removing stop words and utilizing a stemmed query algorithm.
        // Query is performed over all text representations (preferred and
        // non-preferred) that contain every non-stop word in the given
        // string.
        //
        // On resolve, we sort by lucene score ('matchToQuery' sort algorithm).

        StringBuffer searchPhrase = new StringBuffer();
        String[] words = toWords(s, true);
        for (int i = 0; i < words.length; i++) {
            if (i > 0)
                searchPhrase.append(" AND ");
            searchPhrase.append(words[i]);
        }

        // Define the code set and add restrictions ...
        CodedNodeSet nodes = lbSvc.getNodeSet(scheme, csvt, null);
        nodes.restrictToMatchingDesignations(searchPhrase.toString(), SearchDesignationOption.ALL,
                "StemmedLuceneQuery", null);
        SortOptionList sortCriteria = Constructors.createSortOptionList(new String[] { "matchToQuery" });
        PropertyType[] fetchTypes = new PropertyType[] { PropertyType.PRESENTATION };

        // Resolve and analyze the result ...
        ResolvedConceptReferenceList matches = nodes.resolveToList(sortCriteria, null, fetchTypes, -1);

        // Found a match? If so, sort according to relevance.
        if (matches.getResolvedConceptReferenceCount() > 0) {
            // NOTE that a match so far only indicates that all of the words in
            // the
            // passed in string also exist in a term assigned to the resolved
            // concepts. It does not, however, exclude results where the terms
            // also have additional words. This code currently processes only
            // full matches by calling getReferenceWeight and taking only those
            // values with weight '1'.

            // List concept references with exact correlation of at least one
            // concept term to the compare string.
            final List<String> matchWords = Arrays.asList(words);
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                if (getReferenceWeight(ref, matchWords) == 1)
                    printText(ref);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }
    }

    /**
     * Attempt to approximate spelling suggestions.
     * 
     * @param s
     *            The test string.
     * @param lbSvc
     * @param scheme
     * @param csvt
     * @throws LBException
     */
    protected void matchSpell(String s, LexBIGService lbSvc, String scheme, CodingSchemeVersionOrTag csvt)
            throws LBException {
        Util.displayMessage("\n*** Matching spell suggestions ...");

        // Note: Metaphrase contained an integrated spell check module;
        // LexBIG does not as of the 2.0 release. Instead, spell
        // suggestion is based off entire phrase in comparison
        // to content stored in the ontology; no external dictionary
        // is utilized. While a dedicated speller has advantages,
        // there are also advantages to this approach (e.g. automatic
        // tuning of the search to the ontology content).
        //
        // Two different algorithms are highlighted in this code
        // for consideration. The first utilizes the lucene-based
        // Levenshtein distance algorithm. The second utilizes the
        // DoubleMetaphoneLuceneQuery (sounds-like) algorithm.
        //
        // Queries are currently made against all text representations
        // and not just the primary text representation for ,
        // but this could be narrowed.
        //
        // On resolve, we sort by lucene score ('matchToQuery' sort algorithm).

        // Levenshtein 'edit distance' processing ...
        String[] words = toWords(s, true);
        String prefix = words[0].substring(0, 1);

        CodedNodeSet nodes = lbSvc.getNodeSet(scheme, csvt, null);
        StringBuffer searchPhrase = new StringBuffer(s).append('~');
        nodes.restrictToMatchingDesignations(searchPhrase.toString(), SearchDesignationOption.ALL,
                MatchAlgorithms.LuceneQuery.toString(), null);

        SortOptionList sortCriteria = Constructors.createSortOptionList(new String[] { "matchToQuery" });

        // Resolve and analyze the result ...
        ResolvedConceptReferenceList matches = nodes.resolveToList(sortCriteria, null, new PropertyType[] {}, 5);

        // Found a match?
        Util.displayMessage("\tLevenshtein distance (limited to 5 hits): ");
        if (matches.getResolvedConceptReferenceCount() > 0) {
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                // OK, so we have it narrowed to a concept at this point but not
                // a
                // specific term. Narrow the choices to those that include the
                // same
                // starting letter and number of words as our test phrase.
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, true, prefix, words.length);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }

        // Sounds-like processing ...
        nodes = lbSvc.getNodeSet(scheme, csvt, null);
        nodes.restrictToMatchingDesignations(s, SearchDesignationOption.ALL, MatchAlgorithms.DoubleMetaphoneLuceneQuery
                .toString(), null);

        // Resolve and analyze the result ...
        matches = nodes.resolveToList(sortCriteria, null, new PropertyType[] {}, 5);

        // Found a match?
        Util.displayMessage("\n\tSounds-like/DoubleMetaphone (limited to 5 hits): ");
        if (matches.getResolvedConceptReferenceCount() > 0) {
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                // Once again we have it narrowed to an entry at this point but
                // not a specific term. Narrow the choices to those that include
                // the
                // same starting letter and number of words as our test phrase.
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, true, prefix, words.length);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }

    }

    /**
     * Attempt to approximate term completion.
     * 
     * @param s
     *            The test string.
     * @param lbSvc
     * @param scheme
     * @param csvt
     * @throws LBException
     */
    @SuppressWarnings("unchecked")
    protected void matchTermCompletion(String s, LexBIGService lbSvc, String scheme, CodingSchemeVersionOrTag csvt)
            throws LBException {
        Util.displayMessage("\n*** Term completion ...");

        // Term completion is basically 'startsWith'. Two different
        // queries are tried to match. The first utilizes the standard
        // startsWith match algorithm. The second utilizes regular
        // expressions to try and accomplish short-circuiting more
        // efficiently.
        //
        // Queries are currently made against all text representations,
        // but this could be narrowed.
        //
        // On resolve, we sort by lucene score ('matchToQuery' sort algorithm).

        String prefix = s.toLowerCase();
        String[] words = toWords(s, false);
        CodedNodeSet nodes = lbSvc.getNodeSet(scheme, csvt, null);
        nodes.restrictToMatchingDesignations(prefix, SearchDesignationOption.ALL, "startsWith", null);
        SortOptionList sortCriteria = Constructors.createSortOptionList(new String[] { "matchToQuery" });

        // Resolve and analyze the result ...
        ResolvedConceptReferenceList matches = nodes.resolveToList(sortCriteria, null, new PropertyType[] {}, 5);

        // Found a match?
        Util.displayMessage("\tstartsWith (limited to 5 hits); short circuit after query...");
        if (matches.getResolvedConceptReferenceCount() > 0) {
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, false, prefix, words.length + 1);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }

        // Try again with regular expression.
        nodes = lbSvc.getNodeSet(scheme, csvt, null);
        String regex = '^' + prefix + "\\W*\\w*\\Z";
        nodes.restrictToMatchingDesignations(regex, SearchDesignationOption.ALL, "RegExp", null);

        // Resolve and analyze the result ...
        matches = nodes.resolveToList(sortCriteria, null, new PropertyType[] {}, 5);

        // Found a match?
        Util.displayMessage("\n\tRegExp match (limited to 5 hits); short circuit on query...");
        if (matches.getResolvedConceptReferenceCount() > 0) {
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, false, prefix, words.length + 1);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }
    }

    /**
     * Attempt to approximate word completion.
     * 
     * @param s
     *            The test string.
     * @param lbSvc
     * @param scheme
     * @param csvt
     * @throws LBException
     */
    @SuppressWarnings("unchecked")
    protected void matchWordCompletion(String s, LexBIGService lbSvc, String scheme, CodingSchemeVersionOrTag csvt)
            throws LBException {
        Util.displayMessage("\n*** Word completion (based on '.') ...");

        // For word completion, we just use a regular expression and
        // substitute any alpha-numeric character for the substitution
        // character ('.').
        // Queries are currently made against all text representations,
        // but this could be narrowed.
        //
        // On resolve, we sort by lucene score ('matchToQuery' sort algorithm).

        String prefix = s.toLowerCase();
        String[] words = toWords(prefix, false);
        StringBuffer regex = new StringBuffer();
        regex.append('^');
        for (int i = 0; i < words.length; i++) {
            boolean lastWord = i == words.length - 1;
            String word = words[i];
            regex.append('(');
            if (word.charAt(word.length() - 1) == '.') {
                regex.append(word.substring(0, word.length()));
                regex.append("\\w*");
            } else
                regex.append(word);
            regex.append("\\s").append(lastWord ? '*' : '+');
            regex.append(')');
        }
        regex.append("\\Z");

        CodedNodeSet nodes = lbSvc.getNodeSet(scheme, csvt, null);
        nodes.restrictToMatchingDesignations(regex.toString(), SearchDesignationOption.ALL, "RegExp", null);
        SortOptionList sortCriteria = Constructors.createSortOptionList(new String[] { "matchToQuery" });

        // Resolve and analyze the result ...
        ResolvedConceptReferenceList matches = nodes.resolveToList(sortCriteria, null, new PropertyType[] {}, 5);

        // Found a match?
        Util.displayMessage("\tFound via RegExp match (limited to 5 hits)...");
        if (matches.getResolvedConceptReferenceCount() > 0) {
            int dotIndex = prefix.indexOf('.');
            if (dotIndex < 0)
                dotIndex = prefix.length();
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, false, prefix.substring(0, dotIndex), -1);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }

    }

    /**
     * Attempt to approximate compositional or sub-query match (e.g.,
     * "peptic ulcer" will match the two separate entries for "peptic" and
     * "ulcer", in case the ontology does not contain any entry matching the
     * full text "peptic ulcer").
     * 
     * @param s
     *            The test string.
     * @param lbSvc
     * @param scheme
     * @param csvt
     * @throws LBException
     */
    protected void matchSubquery(String s, LexBIGService lbSvc, String scheme, CodingSchemeVersionOrTag csvt)
            throws LBException {
        Util.displayMessage("\n*** Subquery match (incorporates partial normalization/stemming).");

        // Similar to synonym match, only allow match on any single non-stop
        // word
        // and only print presentations matching the query.
        //
        // On resolve, we sort by lucene score ('matchToQuery' sort algorithm).

        StringBuffer searchPhrase = new StringBuffer();
        String[] words = toWords(s, true);
        for (int i = 0; i < words.length; i++) {
            if (i > 0)
                searchPhrase.append(" OR ");
            searchPhrase.append(words[i]);
        }

        // Define the code set and add restrictions ...
        CodedNodeSet nodes = lbSvc.getNodeSet(scheme, csvt, null);
        nodes.restrictToMatchingDesignations(searchPhrase.toString(), SearchDesignationOption.ALL,
                "StemmedLuceneQuery", null);
        SortOptionList sortCriteria = Constructors.createSortOptionList(new String[] { "matchToQuery" });
        PropertyType[] fetchTypes = new PropertyType[] { PropertyType.PRESENTATION };

        // Resolve and analyze the result ...
        ResolvedConceptReferenceList matches = nodes.resolveToList(sortCriteria, null, fetchTypes, -1);

        // Found a match? If so, print each associated presentation containing
        // at least
        // one word from the query expression (exclude non-matching synonyms).
        if (matches.getResolvedConceptReferenceCount() > 0) {
            for (int i = 0; i < matches.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = matches.getResolvedConceptReference(i);
                printText(ref, words, 1);
            }
        } else {
            Util.displayMessage("\tNo match found.");
        }
    }

    /**
     * Return a relative weight between 0 and 1 that indicates how well the
     * given reference maps to a set of words.
     * 
     * @param ref
     * @param matchWords
     * @return
     */
    protected float getReferenceWeight(ResolvedConceptReference ref, List<String> matchWords) {
        float weight = 0;
        Entity ce = ref.getEntity();
        int presCount = ce.getPresentationCount();
        for (int i = 0; i < presCount; i++) {
            weight = Math.max(weight, getTextWeight(ce.getPresentation(i).getValue().getContent(), matchWords));
        }
        return weight;
    }

    /**
     * Return a relative weight between 0 and 1 that indicates how well the
     * given string maps to a set of words.
     * 
     * @param ref
     * @param matchWords
     * @return
     */
    protected float getTextWeight(String text, List<String> matchWords) {
        int matchedInTextWords = 0;
        int matchedInCompareWords = 0;
        String[] textWords = toWords(text, true);
        Set<String> textWordSet = new HashSet<String>();
        for (int i = 0; i < textWords.length; i++) {
            String word = textWords[i];
            textWordSet.add(word);
            if (matchWords.contains(word))
                matchedInCompareWords++;
        }
        for (Iterator<String> compareWords = matchWords.iterator(); compareWords.hasNext();) {
            String word = compareWords.next();
            if (textWordSet.contains(word))
                matchedInTextWords++;
        }
        float textToCompareRatio = matchedInCompareWords / textWordSet.size();
        float compareToTextRatio = matchedInTextWords / matchWords.size();
        return (textToCompareRatio + compareToTextRatio) / 2;
    }

    /**
     * Print code and text for all text presentations associated with the given
     * node reference.
     * 
     * @param ref
     */
    protected void printText(ResolvedConceptReference ref) {
        printText(ref, true, (String) null, -1);
    }

    /**
     * Print code and text for all text presentations associated with the given
     * concept reference. If specified, any printed terms will have stop words
     * removed prior to comparison, must include the given prefix, and can be
     * constrained to those with a set number of words.
     * 
     * @param ref
     * @param removeStopWords
     * @param prefix
     * @param wordCount
     */
    protected void printText(ResolvedConceptReference ref, boolean removeStopWords, String prefix, int wordCount) {
        // Identify the unique set of text presentations;
        // avoid duplicate registration by multiple sources, etc ...
        Entity node = ref.getEntity();
        Set<String> presentations = new HashSet<String>();
        for (int j = 0; j < node.getPresentationCount(); j++)
            presentations.add(node.getPresentation(j).getValue().getContent());

        // Print the result
        String code = "\t[" + ref.getConceptCode() + ']';
        for (Iterator<String> strings = presentations.iterator(); strings.hasNext();) {
            String term = strings.next();
            if ((prefix == null || term.toLowerCase().startsWith(prefix))
                    && (wordCount < 0 || toWords(term, removeStopWords).length == wordCount))
                Util.displayMessage(code + term);
        }
    }

    /**
     * Print code and text for all text presentations associated with the given
     * concept reference. If specified, the given words are compared against
     * each printed presentation according to the chosen match option. Match
     * option of 1 indicates to match at least one word; match option of 2
     * indicates a presentation must match all words.
     * 
     * @param ref
     * @param matchWords
     * @param matchOption
     */
    protected void printText(ResolvedConceptReference ref, String[] matchWords, int matchOption) {
        // Identify the unique set of text presentations;
        // avoid duplicate registration by multiple sources, etc ...
        Entity node = ref.getEntity();
        Set<String> presentations = new HashSet<String>();
        for (int j = 0; j < node.getPresentationCount(); j++)
            presentations.add(node.getPresentation(j).getValue().getContent());

        // Print matching results
        String code = "\t[" + ref.getConceptCode() + ']';
        for (Iterator<String> strings = presentations.iterator(); strings.hasNext();) {
            String term = strings.next();
            final List<String> words = Arrays.asList(matchWords);
            float weight = getTextWeight(term, words);
            if (matchOption == 1 && weight > 0 || matchOption == 2 && weight == 1)
                Util.displayMessage(code + term);
        }
    }

    /**
     * Returns an array containing the individual white-space delimited words
     * contained by the given string, normalizing to lowercase and optionally
     * removing stop words.
     * 
     * @param s
     * @param removeStopWords
     * @return
     */
    protected String[] toWords(String s, boolean removeStopWords) {
        String[] words = s.split("\\s");
        List<String> adjusted = new ArrayList<String>();
        for (int i = 0; i < words.length; i++) {
            if (!removeStopWords || !STOP_WORDS.contains(words[i]))
                adjusted.add(words[i].toLowerCase());
        }
        return adjusted.toArray(new String[adjusted.size()]);
    }

}