NIH | National Cancer Institute | NCI Wiki  

Java Code
/*
 * Copyright: (c) 2004-2009 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 * 		http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.LexGrid.LexBIG.example;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.LexGrid.LexBIG.DataModel.Collections.LocalNameList;
import org.LexGrid.LexBIG.DataModel.Collections.ResolvedConceptReferenceList;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeSummary;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeVersionOrTag;
import org.LexGrid.LexBIG.DataModel.Core.ResolvedConceptReference;
import org.LexGrid.LexBIG.Exceptions.LBException;
import org.LexGrid.LexBIG.Impl.LexBIGServiceImpl;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet;
import org.LexGrid.LexBIG.LexBIGService.LexBIGService;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.SearchDesignationOption;
import org.LexGrid.LexBIG.Utility.Constructors;
import org.LexGrid.LexBIG.Utility.Iterators.ResolvedConceptReferencesIterator;
import org.LexGrid.concepts.Entity;
import org.LexGrid.concepts.Presentation;

/**
 * Example showing how to find codes matching descriptive text. The program
 * accepts up to two parameters...
 * 
 * The first param (required) indicates the text used to search matching
 * descriptions. Matches are determined through a customized match algorithm,
 * which uses a simple heuristic to try and rank returned values by relevance.
 * 
 * The second param (optional) indicates the type of entity to search. Possible
 * values include the LexGrid built-in types "concept" and "instance".
 * Additional supported types can be defined uniquely to a coding scheme. If
 * provided, this should be a comma-delimited list of types. If not provided,
 * all entity types are searched.
 * 
 * Example: FindCodesForDescription "blood" Example: FindCodesForDescription
 * "breast cancer" "concept"
 * 
 */
public class FindCodesForDescription {
    // Identify common stop words (words to be ignored in most match
    // circumstances).
    // This list extends from the LVG stop words ...
    static final List<String> STOP_WORDS = Arrays.asList(new String[] { "a", "an", "and", "by", "for", "of", "on",
            "in", "nos", "the", "to", "with" });

    public FindCodesForDescription() {
        super();
    }

    /**
     * Entry point for processing.
     * 
     * @param args
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Example: FindCodesForDescription \"breast cancer\" \"concept\"");
            return;
        }

        try {
            String phrase = args[0];
            String[] nodeTypes = null;
            if (args.length > 1)
                nodeTypes = args[1].split(",");
            new FindCodesForDescription().run(phrase, nodeTypes);
        } catch (Exception e) {
            Util.displayAndLogError("REQUEST FAILED !!!", e);
        }
    }

    public void run(String phrase, String[] nodeTypes) throws LBException {
        CodingSchemeSummary css = Util.promptForCodeSystem();
        if (css != null) {
            LexBIGService lbs = LexBIGServiceImpl.defaultInstance();
            CodingSchemeVersionOrTag csvt = new CodingSchemeVersionOrTag();
            csvt.setVersion(css.getRepresentsVersion());
            LocalNameList typeList = Constructors.createLocalNameList(nodeTypes);

            ResolvedConceptReferencesIterator nodeRefs = search(lbs, css.getLocalName(), csvt, phrase, typeList);
            if (!nodeRefs.hasNext())
                Util.displayMessage("No matches found.");
            while (nodeRefs.hasNext()) {
                ResolvedConceptReference rcr = nodeRefs.next();
                Util.displayMessage("Code: " + rcr.getConceptCode());
                Util.displayMessage("\tCoding Scheme Name...: " + rcr.getCodingSchemeName());
                Util.displayMessage("\tCoding Scheme URI....: " + rcr.getCodingSchemeURI());
                Util.displayMessage("\tCoding Scheme Version: " + rcr.getCodingSchemeVersion());
                Util.displayMessage("\tCode Namespace...... : "
                        + (rcr.getCodeNamespace() != null ? rcr.getCodeNamespace() : "<default>"));
                Util.displayMessage("\tCode Description.... : "
                        + (rcr.getEntityDescription() != null ? rcr.getEntityDescription().getContent() : ""));
                String typeString = "";
                for (Iterator<String> types = rcr.iterateEntityType(); types.hasNext();)
                    typeString += (types.next() + (types.hasNext() ? "," : ""));
                Util.displayMessage("\tCode Entity Types... : " + typeString);
            }
        }
    }

    protected ResolvedConceptReferencesIterator search(LexBIGService lbs, String codingSchemeName,
            CodingSchemeVersionOrTag csvt, String phrase, LocalNameList nodeTypeList) {
        try {
            CodedNodeSet cns = lbs.getNodeSet(codingSchemeName, csvt, nodeTypeList);
            cns.restrictToMatchingDesignations(phrase, SearchDesignationOption.PREFERRED_ONLY,
                    "DoubleMetaphoneLuceneQuery", null);
            ResolvedConceptReferencesIterator resultIterator = cns.resolve(null, null, null, null, true);
            return sortByScore(phrase, resultIterator, 100);
        } catch (Exception ex) {
            ex.printStackTrace();
            return null;
        }
    }

    /**
     * Sorts the given concept references based on a scoring algorithm designed
     * to provide a more natural ordering. Scores are determined by comparing
     * each reference against a provided search term.
     * 
     * @param searchTerm
     *            The term used for comparison; single or multi-word.
     * @param toSort
     *            The iterator containing references to sort.
     * @param maxToReturn
     *            Sets upper limit for number of top-scored items returned.
     * @return Iterator over sorted references.
     * @throws LBException
     */
    protected ResolvedConceptReferencesIterator sortByScore(String searchTerm,
            ResolvedConceptReferencesIterator toSort, int maxToReturn) throws LBException {
        // Determine the set of individual words to compare against.
        List<String> compareWords = toScoreWords(searchTerm);

        // Create a bucket to store results.
        Map<String, ScoredTerm> scoredResult = new TreeMap<String, ScoredTerm>();

        // Score all items ...
        while (toSort.hasNext()) {
            // Working in chunks of 100.
            ResolvedConceptReferenceList refs = toSort.next(100);
            for (int i = 0; i < refs.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = refs.getResolvedConceptReference(i);
                String code = ref.getConceptCode();
                Entity node = ref.getEntity();

                // Note: Preferred descriptions carry more weight,
                // but we process all terms to allow the score to improve
                // based on any contained presentation.
                Presentation[] allTermsForEntity = node.getPresentation();
                for (Presentation p : allTermsForEntity) {
                    float score = score(p.getValue().getContent(), compareWords, p.isIsPreferred(), i);

                    // Check for a previous match on this code for a different
                    // presentation.
                    // If already present, keep the highest score.
                    if (scoredResult.containsKey(code)) {
                        ScoredTerm scoredTerm = (ScoredTerm) scoredResult.get(code);
                        if (scoredTerm.score > score)
                            continue;
                    }
                    scoredResult.put(code, new ScoredTerm(ref, score));
                }
            }
        }
        // Return an iterator that will sort the scored result.
        return new ScoredIterator(scoredResult.values(), maxToReturn);
    }

    /**
     * Returns a score providing a relative comparison of the given text against
     * a set of keywords.
     *
     * Currently the score is evaluated as a simple percentage based on number
     * of words in the first set that are also in the second, with minor
     * adjustment for order (matching later words given slightly higher weight,
     * anticipating often the subject of search will follow descriptive text).
     * Weight is also increased for shorter phrases (measured in # words) If the
     * text is indicated to be preferred, the score is doubled to promote
     * 'bubbling to the top'.
     *
     * Ranking from the original search is available but not currently used in
     * the heuristic (tends to throw-off desired alphabetic groupings later).
     * 
     * @param text
     * @param keywords
     * @param isPreferred
     * @param searchRank
     * @return The score; a higher value indicates a stronger match.
     */
    protected float score(String text, List<String> keywords, boolean isPreferred, float searchRank) {
        List<String> wordsToCompare = toScoreWords(text);
        float totalWords = wordsToCompare.size();
        float matchScore = 0;
        float position = 0;
        for (Iterator<String> words = wordsToCompare.listIterator(); words.hasNext(); position++) {
            String word = words.next();
            if (keywords.contains(word))
                matchScore += ((position / 10) + 1);
        }
        return Math.max(0, 100 + (matchScore / totalWords * 100) - (totalWords * 2)) * (isPreferred ? 2 : 1);
    }

    /**
     * Return words from the given string to be used in scoring algorithms, in
     * order of occurrence and ignoring duplicates, stop words, whitespace and
     * common separators.
     * 
     * @param s
     * @return List
     */
    protected List<String> toScoreWords(String s) {
        return toWords(s, "[\\s,:+-;]", true, true);
    }

    /**
     * Return words from the given string in order of occurrence, normalized to
     * lower case, separated by the given delimiters (regular expression), and
     * optionally removing stop words and duplicates.
     * 
     * @param s
     * @param delimitRegex
     * @param removeStopWords
     * @param removeDuplicates
     * @return List<String>
     */
    protected List<String> toWords(String s, String delimitRegex, boolean removeStopWords, boolean removeDuplicates) {
        String[] words = s.split(delimitRegex);
        List<String> adjusted = new ArrayList<String>();
        for (int i = 0; i < words.length; i++) {
            String temp = words[i].toLowerCase();
            if (removeDuplicates && adjusted.contains(temp))
                continue;
            if (!removeStopWords || !STOP_WORDS.contains(temp))
                adjusted.add(temp);
        }
        return adjusted;
    }
}