NIH | National Cancer Institute | NCI Wiki  

WIKI MAINTENANCE NOTICE

Please be advised that NCI Wiki will undergo maintenance on Monday, July 22nd, between 1700 ET and 1800 ET, and will be unavailable during this period.
Please save all work before the maintenance window begins.

If you have any questions or concerns, please contact the CBIIT Atlassian Management Team.

Versions Compared

Key

  • This line was added.
  • This line was removed.
  • Formatting was changed.
Comment: Migrated to Confluence 5.3
Code Block
language: Java
title: Java Code
/*
 * Copyright: (c) 2004-2009 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 * 		http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.LexGrid.LexBIG.example;

import java.util.Formatter;
import java.util.Iterator;
import java.util.Locale;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.LexGrid.LexBIG.DataModel.Collections.ResolvedConceptReferenceList;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeSummary;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeVersionOrTag;
import org.LexGrid.LexBIG.DataModel.Core.ResolvedConceptReference;
import org.LexGrid.LexBIG.Exceptions.LBException;
import org.LexGrid.LexBIG.Impl.LexBIGServiceImpl;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet;
import org.LexGrid.LexBIG.LexBIGService.LexBIGService;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.PropertyType;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.SearchDesignationOption;
import org.LexGrid.LexBIG.Utility.ConvenienceMethods;
import org.LexGrid.LexBIG.Utility.Iterators.ResolvedConceptReferencesIterator;
import org.LexGrid.LexBIG.Utility.LBConstants.MatchAlgorithms;
import org.LexGrid.LexBIG.Utility.LBConstants.SortableProperties;
import org.LexGrid.concepts.Entity;
import org.LexGrid.concepts.Presentation;
import org.apache.commons.collections.BidiMap;
import org.apache.commons.collections.MapIterator;
import org.apache.commons.collections.bidimap.TreeBidiMap;

/**
 * Example showing a simple scoring algorithm that evaluates a provided term
 * against available terms in a code system. A cutoff percentage can optionally
 * be provided.
 */
public class ScoreTerm {

    /** Number of concept references requested from the iterator per call. */
    private static final int RESOLVE_CHUNK_SIZE = 100;

    /** Printed width of the "Code" report column (matches the report format). */
    private static final int CODE_WIDTH = 10;

    /** Printed width of the "Term" report column; longer terms are wrapped. */
    private static final int TERM_WIDTH = 60;

    /**
     * Result holder used to manage and sort scored terms.
     * <p>
     * Instances are stored as <i>values</i> of a {@code TreeBidiMap}, which
     * orders values through {@link #compareTo(Object)} and requires them to be
     * unique. The concept code therefore takes part in the ordering as the
     * final tie-breaker; without it, two different concepts carrying the same
     * text and the same score would evict each other from the map.
     * <p>
     * The raw {@code Comparable} type is kept so that the
     * {@code compareTo(Object)} signature stays available to existing callers.
     */
    protected class ScoredTerm implements Comparable {
        /** Code of the concept the term belongs to; may be null. */
        String code;
        /** Presentation text that was scored. */
        String term;
        /** Score (a percentage) assigned to the text. */
        float score;

        /**
         * Orders by score (highest first), then by term text, then by code.
         *
         * @param o
         *            The object to compare with.
         * @return A negative, zero or positive value following the ordering
         *         above; 0 when the argument is not a ScoredTerm.
         */
        public int compareTo(Object o) {
            if (!(o instanceof ScoredTerm)) {
                return 0;
            }
            ScoredTerm other = (ScoredTerm) o;
            // Arguments are swapped on purpose: higher scores sort first.
            int result = Float.compare(other.score, score);
            if (result == 0) {
                result = compareNullable(term, other.term);
            }
            if (result == 0) {
                result = compareNullable(code, other.code);
            }
            return result;
        }

        /**
         * Two scored terms are equal when score, term and code all agree,
         * which keeps equality consistent with {@link #compareTo(Object)}.
         */
        @Override
        public boolean equals(Object o) {
            if (this == o) {
                return true;
            }
            if (!(o instanceof ScoredTerm)) {
                return false;
            }
            return compareTo(o) == 0;
        }

        @Override
        public int hashCode() {
            int hash = Float.floatToIntBits(score);
            hash = 31 * hash + (term == null ? 0 : term.hashCode());
            hash = 31 * hash + (code == null ? 0 : code.hashCode());
            return hash;
        }

    }

    public ScoreTerm() {
    }

    /**
     * Program entry point.
     * <p>
     * All arguments are joined with single spaces to form the search term,
     * except that a numeric last argument (optionally followed by '%') is
     * taken as the cutoff percentage instead.
     * 
     * @param args
     *            The term to evaluate, optionally followed by a cutoff.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Example: ScoreTerm \"some term to evaluate\"");
            System.out.println("Example: ScoreTerm \"some term to evaluate\" 25%");
            return;
        }
        StringBuilder term = new StringBuilder(args[0]);
        float cutoff = 0;
        if (args.length > 1) {
            // Treat everything but last argument automatically as part of
            // search term. This helps in Linux where it is difficult to enter
            // spaces in the command line.
            int lastIndex = args.length - 1;
            for (int i = 1; i < lastIndex; i++) {
                term.append(' ').append(args[i]);
            }

            // Treat the last argument as cutoff if numeric, otherwise
            // consider part of the search term as well (same reason as above).
            // The argument is appended as entered, so a trailing '%' on a
            // non-numeric word is not lost.
            String lastArg = args[lastIndex].trim();
            Float parsed = parseCutoff(lastArg);
            if (parsed != null) {
                cutoff = parsed.floatValue();
            } else {
                term.append(' ').append(lastArg);
            }
        }
        try {
            ScoreTerm pgm = new ScoreTerm();
            pgm.run(term.toString(), cutoff);
        } catch (Exception e) {
            Util.displayAndLogError("REQUEST FAILED !!!", e);
        }
    }

    /**
     * Interprets a command line argument as a cutoff percentage.
     * 
     * @param arg
     *            The argument text; a single trailing '%' is ignored.
     * @return The parsed value, or null when the text is not a finite number
     *         (including the literals NaN and Infinity, which Java would
     *         otherwise parse successfully).
     */
    private static Float parseCutoff(String arg) {
        String val = arg.trim();
        if (val.endsWith("%")) {
            val = val.substring(0, val.length() - 1);
        }
        try {
            float parsed = Float.parseFloat(val);
            if (Float.isNaN(parsed) || Float.isInfinite(parsed)) {
                return null;
            }
            return Float.valueOf(parsed);
        } catch (NumberFormatException notNumeric) {
            return null;
        }
    }

    /**
     * Null-tolerant string comparison; null sorts before any non-null value.
     */
    private static int compareNullable(String a, String b) {
        if (a == null) {
            return (b == null) ? 0 : -1;
        }
        if (b == null) {
            return 1;
        }
        return a.compareTo(b);
    }

    /**
     * Runs the score algorithm for a specific term and prints a report.
     * <p>
     * For every matching concept the best scoring presentation is kept.
     * 
     * @param term
     *            The text to evaluate.
     * @param minScore
     *            Lower cutoff (percentage); only presentations scoring
     *            strictly above this value are reported.
     * @throws Exception
     */
    public void run(String term, float minScore) throws Exception {
        // Allow the user to pick the target coding scheme.
        // This could also be hardcoded to a specific coding scheme.
        CodingSchemeSummary css = Util.promptForCodeSystem();
        if (css == null) {
            return;
        }

        // Determine the set of individual words to compare against.
        SortedSet compareWords = toWords(term);

        // Create a bucket to store results, keyed by concept code.
        // The inverse view orders the entries by score (highest score
        // first), then term, then code.
        BidiMap scoredResult = new TreeBidiMap();

        // Resolve and iterate through matches to score each.
        // For this example, we keep the highest score per coded concept.
        ResolvedConceptReferencesIterator matches = resolveConcepts(css, term);
        try {
            while (matches.hasNext()) {
                ResolvedConceptReferenceList refs = matches.next(RESOLVE_CHUNK_SIZE);
                for (int i = 0; i < refs.getResolvedConceptReferenceCount(); i++) {
                    scoreConcept(refs.getResolvedConceptReference(i), compareWords, minScore, scoredResult);
                }
            }
        } finally {
            // Free the resources held by the iterator once we are done.
            matches.release();
        }

        // Print the results.
        printReport(scoredResult);
    }

    /**
     * Scores every presentation of one concept and records the best one that
     * exceeds the cutoff.
     */
    private void scoreConcept(ResolvedConceptReference ref, SortedSet compareWords, float minScore,
            BidiMap scoredResult) {
        String code = ref.getConceptCode();
        Entity node = ref.getEntity();
        if (code == null || node == null) {
            // Nothing to key the result on, or no text to score.
            return;
        }

        Presentation[] allTermsForConcept = node.getPresentation();
        for (int j = 0; j < allTermsForConcept.length; j++) {
            Presentation p = allTermsForConcept[j];
            if (p == null || p.getValue() == null || p.getValue().getContent() == null) {
                continue;
            }
            String text = p.getValue().getContent();
            float score = score(toWords(text), compareWords);
            if (!(score > minScore)) {
                continue;
            }

            // Check for a previous match on this code for a different
            // presentation. If already present save the item of most
            // relevance.
            ScoredTerm previous = (ScoredTerm) scoredResult.get(code);
            if (previous != null && previous.score > score) {
                continue;
            }

            ScoredTerm scoredTerm = new ScoredTerm();
            scoredTerm.code = code;
            scoredTerm.term = text;
            scoredTerm.score = score;
            scoredResult.put(code, scoredTerm);
        }
    }

    /**
     * Resolves matching concepts for the given query text.
     * 
     * @param css
     *            The code system to search.
     * @param query
     *            The term to match; it is passed to the service using the
     *            LuceneQuery match algorithm.
     * @return An iterator over the matching references, sorted by code, with
     *         presentation properties resolved.
     * @throws LBException
     */
    protected ResolvedConceptReferencesIterator resolveConcepts(CodingSchemeSummary css, String query)
            throws LBException {
        // Define a code set over the target terminology and
        // restrict to concepts with matching text based on
        // the provided term.
        LexBIGService lbs = LexBIGServiceImpl.defaultInstance();
        CodingSchemeVersionOrTag csvt = new CodingSchemeVersionOrTag();
        csvt.setVersion(css.getRepresentsVersion());
        CodedNodeSet cns = lbs.getCodingSchemeConcepts(css.getLocalName(), csvt);

        // Restrict the code set.
        cns.restrictToMatchingDesignations(query, SearchDesignationOption.ALL,
                MatchAlgorithms.LuceneQuery.name(), null);

        // Resolve the concepts and assigned text.
        return cns.resolve(
                ConvenienceMethods.createSortOptionList(new String[] { SortableProperties.code.name() }),
                null,
                new PropertyType[] { PropertyType.PRESENTATION });
    }

    /**
     * Returns a score providing a relative comparison of the first set of words
     * against the second.
     * <p>
     * Currently the score is evaluated as a simple percentage based on number
     * of words in the first set that are also in the second (order
     * independent). This could be enhanced to take order into account, etc.
     * 
     * @param wordsToCompare
     *            The words being evaluated.
     * @param wordsToCompareAgainst
     *            The reference words.
     * @return The score (a percentage from 0 to 100); a higher value indicates
     *         a stronger match. An empty first set scores 0.
     */
    protected float score(SortedSet wordsToCompare, SortedSet wordsToCompareAgainst) {
        int totalWords = wordsToCompare.size();
        if (totalWords == 0) {
            // Avoid 0/0, which would yield NaN instead of a usable score.
            return 0;
        }
        int matchWords = 0;
        for (Iterator words = wordsToCompare.iterator(); words.hasNext();) {
            String word = words.next().toString();
            if (wordsToCompareAgainst.contains(word)) {
                matchWords++;
            }
        }
        return ((float) matchWords / (float) totalWords) * 100;
    }

    /**
     * Display results to the user as a three column table (score, code,
     * term), ordered as defined by {@link ScoredTerm#compareTo(Object)}.
     * Codes longer than the code column are abbreviated with "..." and terms
     * longer than the term column are wrapped onto continuation lines.
     * 
     * @param result
     *            Map from concept code to its {@link ScoredTerm}.
     */
    protected void printReport(BidiMap result) {
        final String Dash6 = "------";
        final String Dash10 = "----------";
        final String Dash60 = "------------------------------------------------------------";

        Formatter f = new Formatter();

        // Print header. The column widths in the format correspond to
        // CODE_WIDTH and TERM_WIDTH.
        String format = "%-5.5s|%-10.10s|%-60.60s\n";
        Object[] hSep = new Object[] { Dash6, Dash10, Dash60 };
        f.format(format, hSep);
        f.format(format, new Object[] { "Score", "Code", "Term" });
        f.format(format, hSep);

        // Iterate over the result; the inverse map is keyed by ScoredTerm.
        for (MapIterator items = result.inverseBidiMap().mapIterator(); items.hasNext();) {
            ScoredTerm st = (ScoredTerm) items.next();
            String code = (String) items.getValue();

            // Evaluate code
            if (code != null && code.length() > CODE_WIDTH) {
                code = code.substring(0, CODE_WIDTH - 3) + "...";
            }

            // Evaluate term (wrap if necessary); a missing term prints blank.
            String term = (st.term == null) ? "" : st.term;
            int end = term.length();
            String firstLine = term.substring(0, Math.min(TERM_WIDTH, end));
            f.format(format, new Object[] { Float.valueOf(st.score), code, firstLine });
            for (int begin = TERM_WIDTH; begin < end; begin += TERM_WIDTH) {
                String sub = term.substring(begin, Math.min(begin + TERM_WIDTH, end));
                f.format(format, new Object[] { "", "", sub });
            }
        }
        Util.displayMessage(f.toString());
    }

    /**
     * Return the words comprising the given string, lower-cased and in sorted
     * order, ignoring duplicates, common separators and punctuation.
     * 
     * @param s
     *            The text to split; null is treated as empty text.
     * @return SortedSet of the distinct lower-cased words; never null.
     */
    protected SortedSet toWords(String s) {
        SortedSet<String> words = new TreeSet<String>();
        if (s == null) {
            return words;
        }
        StringTokenizer st = new StringTokenizer(s, " \t\n\r\f,:+-;");
        while (st.hasMoreTokens()) {
            // Use a fixed locale so matching does not depend on the platform
            // default (e.g. Turkish lower-cases 'I' to a dotless 'i').
            words.add(st.nextToken().toLowerCase(Locale.ENGLISH));
        }
        return words;
    }
}