NIH | National Cancer Institute | NCI Wiki  

WIKI MAINTENANCE NOTICE

Please be advised that NCI Wiki will be undergoing maintenance Monday, July 22nd between 1700 ET and 1800 ET and will be unavailable during this period.
Please ensure all work is saved before said time.

If you have any questions or concerns, please contact the CBIIT Atlassian Management Team.

Error rendering macro 'rw-search'

null

You are viewing an old version of this page. View the current version.

Compare with Current View Page History

« Previous Version 4 Current »

Java Code
/*
 * Copyright: (c) 2004-2009 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 * 		http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.LexGrid.LexBIG.example;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

import org.LexGrid.LexBIG.DataModel.Collections.LocalNameList;
import org.LexGrid.LexBIG.DataModel.Collections.ResolvedConceptReferenceList;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeSummary;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeVersionOrTag;
import org.LexGrid.LexBIG.DataModel.Core.ResolvedConceptReference;
import org.LexGrid.LexBIG.Exceptions.LBException;
import org.LexGrid.LexBIG.Impl.LexBIGServiceImpl;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet;
import org.LexGrid.LexBIG.LexBIGService.LexBIGService;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.SearchDesignationOption;
import org.LexGrid.LexBIG.Utility.Constructors;
import org.LexGrid.LexBIG.Utility.Iterators.ResolvedConceptReferencesIterator;
import org.LexGrid.concepts.Entity;
import org.LexGrid.concepts.Presentation;

/**
 * Example showing how to find codes matching descriptive text. The program
 * accepts up to two parameters...
 * 
 * The first param (required) indicates the text used to search matching
 * descriptions. Candidates are retrieved with the "DoubleMetaphoneLuceneQuery"
 * match algorithm against preferred designations, then re-ranked locally with
 * a simple heuristic (see {@link #score}) to try and order returned values by
 * relevance.
 * 
 * The second param (optional) indicates the type of entity to search. Possible
 * values include the LexGrid built-in types "concept" and "instance".
 * Additional supported types can be defined uniquely to a coding scheme. If
 * provided, this should be a comma-delimited list of types. If not provided,
 * all entity types are searched.
 * 
 * Example: FindCodesForDescription "blood"
 * Example: FindCodesForDescription "breast cancer" "concept"
 * 
 */
public class FindCodesForDescription {
    // Identify common stop words (words to be ignored in most match
    // circumstances).
    // This list extends from the LVG stop words ...
    static final List<String> STOP_WORDS = Arrays.asList("a", "an", "and", "by", "for", "of", "on", "in", "nos",
            "the", "to", "with");

    public FindCodesForDescription() {
        super();
    }

    /**
     * Entry point for processing.
     * 
     * @param args
     *            args[0] (required) is the search phrase; args[1] (optional)
     *            is a comma-delimited list of entity types to search.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Example: FindCodesForDescription \"breast cancer\" \"concept\"");
            return;
        }

        try {
            String phrase = args[0];
            String[] nodeTypes = null;
            if (args.length > 1) {
                nodeTypes = args[1].split(",");
            }
            new FindCodesForDescription().run(phrase, nodeTypes);
        } catch (Exception e) {
            Util.displayAndLogError("REQUEST FAILED !!!", e);
        }
    }

    /**
     * Prompts for a code system, searches it for the given phrase and displays
     * each matching code with its coding scheme details, namespace,
     * description and entity types.
     * 
     * @param phrase
     *            The text to match.
     * @param nodeTypes
     *            Entity types to restrict the search to; may be null.
     * @throws LBException
     */
    public void run(String phrase, String[] nodeTypes) throws LBException {
        CodingSchemeSummary css = Util.promptForCodeSystem();
        if (css == null) {
            return;
        }

        LexBIGService lbs = LexBIGServiceImpl.defaultInstance();
        CodingSchemeVersionOrTag csvt = new CodingSchemeVersionOrTag();
        csvt.setVersion(css.getRepresentsVersion());
        LocalNameList typeList = Constructors.createLocalNameList(nodeTypes);

        ResolvedConceptReferencesIterator nodeRefs = search(lbs, css.getLocalName(), csvt, phrase, typeList);
        if (nodeRefs == null) {
            // search() returns null when the lookup failed and has already
            // reported the error; previously this fell through to a
            // NullPointerException on hasNext().
            return;
        }

        if (!nodeRefs.hasNext()) {
            Util.displayMessage("No matches found.");
        }
        while (nodeRefs.hasNext()) {
            ResolvedConceptReference rcr = nodeRefs.next();
            Util.displayMessage("Code: " + rcr.getConceptCode());
            Util.displayMessage("\tCoding Scheme Name...: " + rcr.getCodingSchemeName());
            Util.displayMessage("\tCoding Scheme URI....: " + rcr.getCodingSchemeURI());
            Util.displayMessage("\tCoding Scheme Version: " + rcr.getCodingSchemeVersion());
            Util.displayMessage("\tCode Namespace...... : "
                    + (rcr.getCodeNamespace() != null ? rcr.getCodeNamespace() : "<default>"));
            Util.displayMessage("\tCode Description.... : "
                    + (rcr.getEntityDescription() != null ? rcr.getEntityDescription().getContent() : ""));

            // Comma-separated list of the entity types, no trailing comma.
            StringBuilder typeString = new StringBuilder();
            for (Iterator<String> types = rcr.iterateEntityType(); types.hasNext();) {
                typeString.append(types.next());
                if (types.hasNext()) {
                    typeString.append(",");
                }
            }
            Util.displayMessage("\tCode Entity Types... : " + typeString);
        }
    }

    /**
     * Searches the given coding scheme for nodes whose preferred designation
     * matches the phrase, and returns up to 100 of them ordered by
     * {@link #sortByScore}.
     * 
     * @param lbs
     *            The service used to resolve the node set.
     * @param codingSchemeName
     *            Name of the coding scheme to search.
     * @param csvt
     *            Version or tag of the coding scheme.
     * @param phrase
     *            The text to match.
     * @param nodeTypeList
     *            Entity types to restrict the search to.
     * @return Iterator over the scored matches, or null if the search failed
     *         (the failure is reported before returning). Callers must check
     *         for null.
     */
    protected ResolvedConceptReferencesIterator search(LexBIGService lbs, String codingSchemeName,
            CodingSchemeVersionOrTag csvt, String phrase, LocalNameList nodeTypeList) {
        try {
            CodedNodeSet cns = lbs.getNodeSet(codingSchemeName, csvt, nodeTypeList);
            cns.restrictToMatchingDesignations(phrase, SearchDesignationOption.PREFERRED_ONLY,
                    "DoubleMetaphoneLuceneQuery", null);
            ResolvedConceptReferencesIterator resultIterator = cns.resolve(null, null, null, null, true);
            return sortByScore(phrase, resultIterator, 100);
        } catch (Exception ex) {
            // Report the same way main() does; the null return is kept so the
            // method's contract towards existing callers is unchanged.
            Util.displayAndLogError("SEARCH FAILED !!!", ex);
            return null;
        }
    }

    /**
     * Sorts the given concept references based on a scoring algorithm designed
     * to provide a more natural ordering. Scores are determined by comparing
     * each reference against a provided search term.
     * 
     * @param searchTerm
     *            The term used for comparison; single or multi-word.
     * @param toSort
     *            The iterator containing references to sort.
     * @param maxToReturn
     *            Sets upper limit for number of top-scored items returned.
     * @return Iterator over sorted references.
     * @throws LBException
     */
    protected ResolvedConceptReferencesIterator sortByScore(String searchTerm,
            ResolvedConceptReferencesIterator toSort, int maxToReturn) throws LBException {
        // Determine the set of individual words to compare against.
        List<String> compareWords = toScoreWords(searchTerm);

        // Create a bucket to store results, keyed by concept code.
        Map<String, ScoredTerm> scoredResult = new TreeMap<String, ScoredTerm>();

        // Score all items ...
        while (toSort.hasNext()) {
            // Working in chunks of 100.
            ResolvedConceptReferenceList refs = toSort.next(100);
            for (int i = 0; i < refs.getResolvedConceptReferenceCount(); i++) {
                ResolvedConceptReference ref = refs.getResolvedConceptReference(i);
                String code = ref.getConceptCode();
                Entity node = ref.getEntity();

                // Note: Preferred descriptions carry more weight,
                // but we process all terms to allow the score to improve
                // based on any contained presentation.
                for (Presentation p : node.getPresentation()) {
                    float termScore = score(p.getValue().getContent(), compareWords, p.isIsPreferred(), i);

                    // Check for a previous match on this code for a different
                    // presentation. If already present, keep the highest
                    // score (a tie replaces the earlier entry).
                    ScoredTerm previous = scoredResult.get(code);
                    if (previous != null && previous.score > termScore) {
                        continue;
                    }
                    scoredResult.put(code, new ScoredTerm(ref, termScore));
                }
            }
        }
        // Return an iterator that will sort the scored result.
        return new ScoredIterator(scoredResult.values(), maxToReturn);
    }

    /**
     * Returns a score providing a relative comparison of the given text against
     * a set of keywords.
     *
     * Currently the score is evaluated as a simple percentage based on number
     * of words in the first set that are also in the second, with minor
     * adjustment for order (matching later words given slightly higher weight,
     * anticipating often the subject of search will follow descriptive text).
     * Weight is also increased for shorter phrases (measured in # words). If
     * the text is indicated to be preferred, the score is doubled to promote
     * 'bubbling to the top'.
     *
     * Ranking from the original search is available but not currently used in
     * the heuristic (tends to throw-off desired alphabetic groupings later).
     * 
     * @param text
     *            The text to score.
     * @param keywords
     *            The words to look for, as produced by {@link #toScoreWords}.
     * @param isPreferred
     *            Whether the text is a preferred presentation.
     * @param searchRank
     *            Rank from the original search; currently unused.
     * @return The score; a higher value indicates a stronger match. Never
     *         negative and never NaN; 0 when the text contains no scorable
     *         words.
     */
    protected float score(String text, List<String> keywords, boolean isPreferred, float searchRank) {
        List<String> wordsToCompare = toScoreWords(text);
        float totalWords = wordsToCompare.size();
        if (totalWords == 0) {
            // Nothing to compare. Without this guard the ratio below is
            // 0f / 0f == NaN, which would poison the ordering of results.
            return 0;
        }

        float matchScore = 0;
        int position = 0;
        for (String word : wordsToCompare) {
            if (keywords.contains(word)) {
                // Later words weigh slightly more than earlier ones.
                matchScore += ((position / 10f) + 1);
            }
            position++;
        }
        return Math.max(0, 100 + (matchScore / totalWords * 100) - (totalWords * 2)) * (isPreferred ? 2 : 1);
    }

    /**
     * Return words from the given string to be used in scoring algorithms, in
     * order of occurrence and ignoring duplicates, stop words, whitespace and
     * common separators (comma, colon, plus, semicolon and hyphen).
     * 
     * @param s
     *            The text to break into words.
     * @return List
     */
    protected List<String> toScoreWords(String s) {
        // The hyphen must come last in the character class to be a literal.
        // Written as "+-;" it formed the range '+'..';', which also matched
        // every digit, '.' and '/', so numbers were dropped from the terms.
        return toWords(s, "[\\s,:+;-]", true, true);
    }

    /**
     * Return words from the given string in order of occurrence, normalized to
     * lower case, separated by the given delimiters (regular expression), and
     * optionally removing stop words and duplicates. Empty tokens, produced
     * when delimiters are adjacent or lead the string, are never returned.
     * 
     * @param s
     *            The text to break into words; null yields an empty list.
     * @param delimitRegex
     *            Regular expression matching a single delimiter.
     * @param removeStopWords
     *            Whether to drop words contained in STOP_WORDS.
     * @param removeDuplicates
     *            Whether to keep only the first occurrence of each word.
     * @return List<String>
     */
    protected List<String> toWords(String s, String delimitRegex, boolean removeStopWords, boolean removeDuplicates) {
        List<String> adjusted = new ArrayList<String>();
        if (s == null) {
            return adjusted;
        }

        for (String raw : s.split(delimitRegex)) {
            String word = raw.toLowerCase();
            if (word.length() == 0) {
                // e.g. the gap between ',' and ' ' in "a, b"; counting it as
                // a word would distort the word total used for scoring.
                continue;
            }
            if (removeDuplicates && adjusted.contains(word)) {
                continue;
            }
            if (removeStopWords && STOP_WORDS.contains(word)) {
                continue;
            }
            adjusted.add(word);
        }
        return adjusted;
    }
}
  • No labels