ScoreTerm Java

Java Code
/*
 * Copyright: (c) 2004-2009 Mayo Foundation for Medical Education and 
 * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
 * triple-shield Mayo logo are trademarks and service marks of MFMER.
 *
 * Except as contained in the copyright notice above, or as used to identify 
 * MFMER as the author of this software, the trade names, trademarks, service
 * marks, or product names of the copyright holder shall not be used in
 * advertising, promotion or otherwise in connection with this software without
 * prior written authorization of the copyright holder.
 * 
 * Licensed under the Eclipse Public License, Version 1.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at 
 * 
 * 		http://www.eclipse.org/legal/epl-v10.html
 * 
 */
package org.LexGrid.LexBIG.example;

import java.util.Formatter;
import java.util.Iterator;
import java.util.SortedSet;
import java.util.StringTokenizer;
import java.util.TreeSet;

import org.LexGrid.LexBIG.DataModel.Collections.ResolvedConceptReferenceList;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeSummary;
import org.LexGrid.LexBIG.DataModel.Core.CodingSchemeVersionOrTag;
import org.LexGrid.LexBIG.DataModel.Core.ResolvedConceptReference;
import org.LexGrid.LexBIG.Exceptions.LBException;
import org.LexGrid.LexBIG.Impl.LexBIGServiceImpl;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet;
import org.LexGrid.LexBIG.LexBIGService.LexBIGService;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.PropertyType;
import org.LexGrid.LexBIG.LexBIGService.CodedNodeSet.SearchDesignationOption;
import org.LexGrid.LexBIG.Utility.ConvenienceMethods;
import org.LexGrid.LexBIG.Utility.Iterators.ResolvedConceptReferencesIterator;
import org.LexGrid.LexBIG.Utility.LBConstants.MatchAlgorithms;
import org.LexGrid.LexBIG.Utility.LBConstants.SortableProperties;
import org.LexGrid.concepts.Entity;
import org.LexGrid.concepts.Presentation;
import org.apache.commons.collections.BidiMap;
import org.apache.commons.collections.MapIterator;
import org.apache.commons.collections.bidimap.TreeBidiMap;

/**
 * Example showing a simple scoring algorithm that evaluates a provided term
 * against available terms in a code system. A cutoff percentage can optionally
 * be provided.
 * <p>
 * The score of a candidate term is the percentage of its distinct words that
 * also occur in the term being evaluated. For every coded concept only the
 * best scoring presentation is kept, and the results are reported with the
 * highest score first.
 */
public class ScoreTerm {

    /** Number of concept references requested from the iterator per call. */
    private static final int CHUNK_SIZE = 100;

    /** Width of the code column in the printed report. */
    private static final int CODE_WIDTH = 10;

    /** Width of the term column; longer terms wrap onto continuation rows. */
    private static final int TERM_WIDTH = 60;

    /**
     * Inner class used to manage and sort results.
     * <p>
     * Instances order by score (highest first), then by term text, then by
     * concept code. The code takes part in the ordering because results are
     * stored as values of a bidirectional map, which keeps at most one entry
     * per distinct value: without it, two different concepts sharing the same
     * score and term text would replace each other.
     */
    protected class ScoredTerm implements Comparable<ScoredTerm> {
        String term;
        float score;
        /** Code of the concept the term belongs to; may be null. */
        String code;

        /**
         * Orders by descending score, then ascending term, then ascending
         * code. Null terms and codes sort before non-null ones.
         */
        public int compareTo(ScoredTerm other) {
            // Arguments are swapped on purpose: higher scores sort first.
            int byScore = Float.compare(other.score, score);
            if (byScore != 0) {
                return byScore;
            }
            int byTerm = compareNullable(term, other.term);
            return (byTerm != 0) ? byTerm : compareNullable(code, other.code);
        }

        /** Equality is kept consistent with {@link #compareTo(ScoredTerm)}. */
        @Override
        public boolean equals(Object o) {
            return (o instanceof ScoredTerm) && compareTo((ScoredTerm) o) == 0;
        }

        @Override
        public int hashCode() {
            // floatToIntBits agrees with Float.compare on NaN and signed zero.
            int h = Float.floatToIntBits(score);
            h = 31 * h + (term == null ? 0 : term.hashCode());
            h = 31 * h + (code == null ? 0 : code.hashCode());
            return h;
        }

    }

    public ScoreTerm() {
    }

    /**
     * Program entry point.
     * <p>
     * All arguments are joined (separated by single spaces) to form the term
     * to evaluate. The last of several arguments is instead taken as the
     * cutoff percentage when it is a finite number, optionally followed by
     * '%'.
     * 
     * @param args
     *            The term to evaluate, optionally followed by a cutoff.
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Example: ScoreTerm \"some term to evaluate\"");
            System.out.println("Example: ScoreTerm \"some term to evaluate\" 25%");
            return;
        }
        StringBuilder term = new StringBuilder(args[0]);
        float cutoff = 0;
        if (args.length > 1) {
            // Treat everything but last argument automatically as part of
            // search term. This helps in Linux where it is difficult to enter
            // spaces in the command line.
            int lastIndex = args.length - 1;
            for (int i = 1; i < lastIndex; i++) {
                term.append(' ').append(args[i]);
            }

            // Treat the last argument as cutoff if numeric, otherwise
            // consider part of the search term as well (same reason as above).
            String lastArg = args[lastIndex].trim();
            Float parsed = parseCutoff(lastArg);
            if (parsed != null) {
                cutoff = parsed.floatValue();
            } else {
                // Append the argument as typed, including any trailing '%'.
                term.append(' ').append(lastArg);
            }
        }
        try {
            ScoreTerm pgm = new ScoreTerm();
            pgm.run(term.toString(), cutoff);
        } catch (Exception e) {
            Util.displayAndLogError("REQUEST FAILED !!!", e);
        }
    }

    /**
     * Interprets a command line argument as a cutoff percentage.
     * 
     * @param arg
     *            The (trimmed) argument; a single trailing '%' is ignored.
     * @return The cutoff, or null if the argument is not a finite number and
     *         should therefore be treated as part of the search term.
     */
    private static Float parseCutoff(String arg) {
        String numeric = arg.endsWith("%") ? arg.substring(0, arg.length() - 1) : arg;
        try {
            float value = Float.parseFloat(numeric);
            // "NaN" and "Infinity" parse successfully but can never act as a
            // usable cutoff (no score compares greater than them).
            if (Float.isNaN(value) || Float.isInfinite(value)) {
                return null;
            }
            return Float.valueOf(value);
        } catch (NumberFormatException nfe) {
            return null;
        }
    }

    /**
     * Null-safe natural-order comparison of two strings; null sorts first.
     */
    private static int compareNullable(String a, String b) {
        if (a == null) {
            return (b == null) ? 0 : -1;
        }
        return (b == null) ? 1 : a.compareTo(b);
    }

    /**
     * Runs the score algorithm for a specific term and prints a report of the
     * best scoring presentation per concept. Nothing is done if no coding
     * scheme is selected at the prompt.
     * 
     * @param term
     *            The text to evaluate.
     * @param minScore
     *            Lower cutoff (percentage). Only terms whose score is
     *            strictly greater than this value are reported.
     * @throws Exception
     *             Propagated from the coding scheme prompt or from resolving
     *             and iterating the matching concepts.
     */
    public void run(String term, float minScore) throws Exception {
        // Allow the user to pick the target coding scheme.
        // This could also be hardcoded to a specific coding scheme.
        CodingSchemeSummary css = Util.promptForCodeSystem();
        if (css == null) {
            return;
        }

        // Determine the set of individual words to compare against.
        SortedSet<String> compareWords = toWords(term);

        // Bucket for the results: concept code -> best ScoredTerm. The
        // inverse view of the map iterates by score (highest first), then
        // term, then code.
        BidiMap scoredResult = new TreeBidiMap();

        // Resolve and iterate through matches (in chunks) to score each.
        // For this example, we keep the highest score per coded concept.
        ResolvedConceptReferencesIterator matches = resolveConcepts(css, term);
        while (matches.hasNext()) {
            ResolvedConceptReferenceList refs = matches.next(CHUNK_SIZE);
            for (int i = 0; i < refs.getResolvedConceptReferenceCount(); i++) {
                scoreConcept(refs.getResolvedConceptReference(i), compareWords, minScore, scoredResult);
            }
        }

        // Print the results.
        printReport(scoredResult);
    }

    /**
     * Scores every presentation of one concept and records the best one that
     * exceeds the cutoff.
     * 
     * @param ref
     *            The resolved concept.
     * @param compareWords
     *            Words of the term being evaluated.
     * @param minScore
     *            Scores must be strictly greater than this to be recorded.
     * @param scoredResult
     *            Map from concept code to its best ScoredTerm; updated in
     *            place.
     */
    private void scoreConcept(ResolvedConceptReference ref, SortedSet<String> compareWords, float minScore,
            BidiMap scoredResult) {
        Entity node = ref.getEntity();
        if (node == null) {
            // Defensive: a reference without a resolved entity has no text
            // to score.
            return;
        }
        String code = ref.getConceptCode();
        Presentation[] allTermsForConcept = node.getPresentation();

        for (int j = 0; j < allTermsForConcept.length; j++) {
            Presentation p = allTermsForConcept[j];
            if (p == null || p.getValue() == null) {
                continue;
            }
            String text = p.getValue().getContent();
            if (text == null) {
                continue;
            }
            float score = score(toWords(text), compareWords);
            if (score > minScore) {
                // Check for a previous match on this code for a different
                // presentation. If already present keep the item of most
                // relevance; on a tie the later presentation wins.
                ScoredTerm previous = (ScoredTerm) scoredResult.get(code);
                if (previous != null && previous.score > score) {
                    continue;
                }
                ScoredTerm scoredTerm = new ScoredTerm();
                scoredTerm.term = text;
                scoredTerm.score = score;
                scoredTerm.code = code;
                scoredResult.put(code, scoredTerm);
            }
        }
    }

    /**
     * Resolves the concepts whose designations match the given term.
     * 
     * @param css
     *            The code system to search.
     * @param query
     *            The term to match; it is handed to the search as a Lucene
     *            query.
     * @return An iterator over the matching references, sorted by code and
     *         resolved with their presentation properties.
     * @throws LBException
     */
    protected ResolvedConceptReferencesIterator resolveConcepts(CodingSchemeSummary css, String query)
            throws LBException {
        // Define a code set over the target terminology and
        // restrict to concepts with matching text based on
        // the provided term.
        LexBIGService lbs = LexBIGServiceImpl.defaultInstance();
        CodingSchemeVersionOrTag csvt = new CodingSchemeVersionOrTag();
        csvt.setVersion(css.getRepresentsVersion());
        CodedNodeSet cns = lbs.getCodingSchemeConcepts(css.getLocalName(), csvt);

        // Restrict the code set. Continue with the returned set rather than
        // relying on the original instance being modified in place.
        cns = cns.restrictToMatchingDesignations(query, SearchDesignationOption.ALL,
                MatchAlgorithms.LuceneQuery.name(), null);

        // Resolve the concepts and assigned text.
        return cns.resolve(ConvenienceMethods.createSortOptionList(new String[] { SortableProperties.code.name() }),
                null, new PropertyType[] { PropertyType.PRESENTATION });
    }

    /**
     * Returns a score providing a relative comparison of the first set of words
     * against the second.
     * <p>
     * Currently the score is evaluated as a simple percentage based on number
     * of words in the first set that are also in the second (order
     * independent). This could be enhanced to take order into account, etc.
     * 
     * @param wordsToCompare
     *            The words being scored; each element is compared by its
     *            string form.
     * @param wordsToCompareAgainst
     *            The reference words.
     * @return The score (a percentage from 0 to 100); a higher value
     *         indicates a stronger match. An empty first set scores 0.
     */
    protected float score(SortedSet<?> wordsToCompare, SortedSet<?> wordsToCompareAgainst) {
        int totalWords = wordsToCompare.size();
        if (totalWords == 0) {
            // Avoid 0/0, which would produce NaN instead of a percentage.
            return 0f;
        }
        int matchWords = 0;
        for (Object word : wordsToCompare) {
            if (wordsToCompareAgainst.contains(word.toString())) {
                matchWords++;
            }
        }
        return ((float) matchWords / (float) totalWords) * 100;
    }

    /**
     * Display results to the user as a three column table (score, code and
     * term). Codes longer than the code column are abbreviated with "...";
     * terms longer than the term column wrap onto continuation rows.
     * 
     * @param result
     *            Map from concept code to ScoredTerm; rows are printed in the
     *            order of the map's inverse view (i.e. ScoredTerm order).
     */
    protected void printReport(BidiMap result) {
        final String dash6 = "------";
        final String dash10 = "----------";
        final String dash60 = "------------------------------------------------------------";
        final String format = "%-5.5s|%-10.10s|%-60.60s\n";

        Formatter f = new Formatter();

        // Print header.
        Object[] hSep = new Object[] { dash6, dash10, dash60 };
        f.format(format, hSep);
        f.format(format, "Score", "Code", "Term");
        f.format(format, hSep);

        // Iterate over the result; in the inverse view the ScoredTerm is the
        // key and the code is the value.
        for (MapIterator items = result.inverseBidiMap().mapIterator(); items.hasNext();) {
            ScoredTerm st = (ScoredTerm) items.next();
            String code = (String) items.getValue();

            // Evaluate code (abbreviate if necessary)
            if (code != null && code.length() > CODE_WIDTH) {
                code = code.substring(0, CODE_WIDTH - 3) + "...";
            }

            // Evaluate term (wrap if necessary). The first row carries the
            // score and code; continuation rows leave those columns blank.
            String term = (st.term == null) ? "" : st.term;
            int end = term.length();
            int begin = 0;
            do {
                String chunk = term.substring(begin, Math.min(begin + TERM_WIDTH, end));
                if (begin == 0) {
                    f.format(format, st.score, code, chunk);
                } else {
                    f.format(format, "", "", chunk);
                }
                begin += TERM_WIDTH;
            } while (begin < end);
        }
        Util.displayMessage(f.out().toString());
    }

    /**
     * Return the distinct words comprising the given string, lower-cased and
     * in sorted order, ignoring common separators and punctuation.
     * 
     * @param s
     *            The text to split; null is treated as empty text.
     * @return SortedSet of the words; never null.
     */
    protected SortedSet<String> toWords(String s) {
        SortedSet<String> words = new TreeSet<String>();
        if (s == null) {
            return words;
        }
        StringTokenizer st = new StringTokenizer(s, " \t\n\r\f,:+-;");
        while (st.hasMoreTokens()) {
            words.add(st.nextToken().toLowerCase());
        }
        return words;
    }
}
Content

Space Tools

ScoreTerm Java