// 大三Java作业,仅供参考。
// @刘_学
package cn;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
public class CosSimilarity
{
public CosSimilarity()
{
// TODO Auto-generated constructor stub
}
public static void main(String\[\] args)
{
// TODO Auto-generated method stub
String D1 = "I don't loves DataBase but java.";
String D2 = "I don't like CaoJiba but JianXu.";
PorterStemmer s = new PorterStemmer(); // 单词词形规范化
D1 = s.getStemmer(D1);
D2 = s.getStemmer(D2);
String\[\] s1, s2;
s1 = D1.split(" ");
s2 = D2.split(" ");
// 测试进行单词词形规范化 System.out.println(D1);
// System.out.println(D2);
ArrayList a = new ArrayList(); // 创建动态数组,记录不同的单词
ArrayList aNum = new ArrayList(); // 创建动态数组,统计不同的单词各自出现的次数
ArrayList b = new ArrayList();
ArrayList bNum = new ArrayList();
for (int i = 0; i < s1.length; i++) // 将s1复制到动态数组a, 且词频统计数组初始化
{
a.add(s1\[i\]);
aNum.add(i, 1);
}
for (int i = 0; i < a.size() - 1; i++) // 记录a不同单词且统计词频
{
int tem = 1; // -----------------------暂存词频
for (int j = i + 1; j < a.size(); j++)
{
if (a.get(i).equalsIgnoreCase(a.get(j)))
{
tem++;
aNum.set(i, tem);
a.remove(j);
aNum.remove(j);
}
}
}
for (int i = 0; i < s2.length; i++) // 将s2复制到动态数组b, 且词频统计数组初始化
{
b.add(s2\[i\]);
bNum.add(i, 1);
}
for (int i = 0; i < b.size() - 1; i++) // 记录b不同单词且统计词频
{
int tem = 1; // -----------------------暂存词频
for (int j = i + 1; j < b.size(); j++)
{
if (b.get(i).equalsIgnoreCase(b.get(j)))
{
tem++;
bNum.set(i, tem);
b.remove(j);
bNum.remove(j);
}
}
}
double denominator = 0; // 计算W1K×W2K
for (int i = 0; i < a.size(); i++) // 计算W1K×W2K
{
for (int j = 0; j < b.size(); j++)
{
if (a.get(i).equals(b.get(j)))
denominator += ((double) aNum.get(i) \* (double) bNum.get(j));
}
}
double sqW1 = 0, sqW2 = 0; // 计算两个向量的模
for (int i = 0; i < aNum.size(); i++)
{
sqW1 += (double) aNum.get(i) \* (double) aNum.get(i);
}
for (int i = 0; i < bNum.size(); i++)
{
sqW2 += (double) bNum.get(i) \* (double) bNum.get(i);
}
System.out.println("余弦相似度为" + denominator / Math.sqrt(sqW1 \* sqW2)); // 输出结果
}
}
PorterStemmer相关代码:将下面的代码复制到记事本,另存为 PorterStemmer.java 即可(与 CosSimilarity.java 放在同一个 cn 包下)。
package cn;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
/**
 * Porter stemmer: strips common English suffixes from a word.
 *
 * <p>Low-level use: feed the letters of one lower-case word with
 * {@link #add(char)}, call {@link #stem()}, then read the result with
 * {@link #toString()} or {@link #getResultBuffer()} /
 * {@link #getResultLength()}. {@link #getStemmer(String)} wraps this for a
 * whole piece of text.
 *
 * <p>Instances hold a mutable working buffer, so one instance must not be
 * shared between threads without external synchronisation.
 */
public class PorterStemmer
{
    /** Number of chars the buffer grows by when it fills up. */
    private static final int INC = 50;

    /** Working buffer holding the word being stemmed. */
    private char[] b;
    private int i,     /* number of chars added so far (offset of next free slot) */
                i_end, /* length of the stemmed word after stem() */
                j, k;  /* j: general offset set by ends(); k: index of the last char of the word */

    public PorterStemmer()
    {
        b = new char[INC];
        i = 0;
        i_end = 0;
    }

    /**
     * Appends one character to the word being built.
     *
     * @param ch the character to append
     */
    public void add(char ch)
    {
        if (i == b.length)
        {
            char[] new_b = new char[i + INC];
            System.arraycopy(b, 0, new_b, 0, i);
            b = new_b;
        }
        b[i++] = ch;
    }

    /**
     * Appends the first {@code wLen} characters of {@code w} to the word
     * being built.
     *
     * @param w    source characters
     * @param wLen number of characters to copy from the start of {@code w}
     */
    public void add(char[] w, int wLen)
    {
        if (i + wLen >= b.length)
        {
            char[] new_b = new char[i + wLen + INC];
            System.arraycopy(b, 0, new_b, 0, i);
            b = new_b;
        }
        for (int c = 0; c < wLen; c++)
        {
            b[i++] = w[c];
        }
    }

    /** Returns the word produced by the most recent {@link #stem()} call. */
    @Override
    public String toString()
    {
        return new String(b, 0, i_end);
    }

    /** Returns the length of the word produced by the most recent {@link #stem()} call. */
    public int getResultLength()
    {
        return i_end;
    }

    /**
     * Returns the internal buffer (not a copy); only the first
     * {@link #getResultLength()} chars are the result.
     */
    public char[] getResultBuffer()
    {
        return b;
    }

    /** True if b[idx] is a consonant; 'y' is a consonant at index 0 or when it follows a vowel. */
    private final boolean cons(int idx)
    {
        switch (b[idx])
        {
            case 'a': case 'e': case 'i': case 'o': case 'u':
                return false;
            case 'y':
                return (idx == 0) ? true : !cons(idx - 1);
            default:
                return true;
        }
    }

    /**
     * Counts the vowel-consonant sequences in b[0..j]: with C a consonant run
     * and V a vowel run, [C](VC)^m[V] gives m.
     */
    private final int m()
    {
        int n = 0;
        int p = 0;
        // Skip the optional leading consonant run.
        while (true)
        {
            if (p > j) return n;
            if (!cons(p)) break;
            p++;
        }
        p++;
        while (true)
        {
            // Skip a vowel run.
            while (true)
            {
                if (p > j) return n;
                if (cons(p)) break;
                p++;
            }
            p++;
            n++;
            // Skip a consonant run.
            while (true)
            {
                if (p > j) return n;
                if (!cons(p)) break;
                p++;
            }
            p++;
        }
    }

    /** True if b[0..j] contains a vowel. */
    private final boolean vowelinstem()
    {
        for (int p = 0; p <= j; p++)
        {
            if (!cons(p)) return true;
        }
        return false;
    }

    /** True if b[pos-1..pos] is a doubled consonant. */
    private final boolean doublec(int pos)
    {
        if (pos < 1) return false;
        if (b[pos] != b[pos - 1]) return false;
        return cons(pos);
    }

    /**
     * True if b[pos-2..pos] is consonant-vowel-consonant and the final
     * consonant is not w, x or y.
     */
    private final boolean cvc(int pos)
    {
        if (pos < 2 || !cons(pos) || cons(pos - 1) || !cons(pos - 2)) return false;
        int ch = b[pos];
        if (ch == 'w' || ch == 'x' || ch == 'y') return false;
        return true;
    }

    /** True if b[0..k] ends with s; on success j is set to the index just before the suffix. */
    private final boolean ends(String s)
    {
        int l = s.length();
        int o = k - l + 1;
        if (o < 0) return false;
        for (int p = 0; p < l; p++)
        {
            if (b[o + p] != s.charAt(p)) return false;
        }
        j = k - l;
        return true;
    }

    /** Overwrites b[j+1..] with s and moves k to the new end of the word. */
    private final void setto(String s)
    {
        int l = s.length();
        int o = j + 1;
        for (int p = 0; p < l; p++)
        {
            b[o + p] = s.charAt(p);
        }
        k = j + l;
    }

    /** Replaces the matched suffix with s when the remaining stem has m() > 0. */
    private final void r(String s)
    {
        if (m() > 0) setto(s);
    }

    /** Removes plurals and -ed / -ing (e.g. "pens" becomes "pen", "computing" becomes "comput"). */
    private final void step1()
    {
        if (b[k] == 's')
        {
            if (ends("sses")) k -= 2;
            else if (ends("ies")) setto("i");
            else if (b[k - 1] != 's') k--;
        }
        if (ends("eed"))
        {
            if (m() > 0) k--;
        }
        else if ((ends("ed") || ends("ing")) && vowelinstem())
        {
            k = j;
            if (ends("at")) setto("ate");
            else if (ends("bl")) setto("ble");
            else if (ends("iz")) setto("ize");
            else if (doublec(k))
            {
                k--;
                int ch = b[k];
                // Keep the doubled letter for -ll, -ss, -zz.
                if (ch == 'l' || ch == 's' || ch == 'z') k++;
            }
            else if (m() == 1 && cvc(k)) setto("e");
        }
    }

    /** Turns a terminal 'y' into 'i' when the stem contains another vowel. */
    private final void step2()
    {
        if (ends("y") && vowelinstem()) b[k] = 'i';
    }

    /** Maps double suffixes to single ones (e.g. -ization to -ize), dispatching on the penultimate letter. */
    private final void step3()
    {
        if (k == 0) return;
        switch (b[k - 1])
        {
            case 'a':
                if (ends("ational")) { r("ate"); break; }
                if (ends("tional")) { r("tion"); break; }
                break;
            case 'c':
                if (ends("enci")) { r("ence"); break; }
                if (ends("anci")) { r("ance"); break; }
                break;
            case 'e':
                if (ends("izer")) { r("ize"); break; }
                break;
            case 'l':
                if (ends("bli")) { r("ble"); break; }
                if (ends("alli")) { r("al"); break; }
                if (ends("entli")) { r("ent"); break; }
                if (ends("eli")) { r("e"); break; }
                if (ends("ousli")) { r("ous"); break; }
                break;
            case 'o':
                if (ends("ization")) { r("ize"); break; }
                if (ends("ation")) { r("ate"); break; }
                if (ends("ator")) { r("ate"); break; }
                break;
            case 's':
                if (ends("alism")) { r("al"); break; }
                if (ends("iveness")) { r("ive"); break; }
                if (ends("fulness")) { r("ful"); break; }
                if (ends("ousness")) { r("ous"); break; }
                break;
            case 't':
                if (ends("aliti")) { r("al"); break; }
                if (ends("iviti")) { r("ive"); break; }
                if (ends("biliti")) { r("ble"); break; }
                break;
            case 'g':
                if (ends("logi")) { r("log"); break; }
        }
    }

    /** Handles -icate, -ative, -alize, -iciti, -ical, -ful and -ness, dispatching on the last letter. */
    private final void step4()
    {
        switch (b[k])
        {
            case 'e':
                if (ends("icate")) { r("ic"); break; }
                if (ends("ative")) { r(""); break; }
                if (ends("alize")) { r("al"); break; }
                break;
            case 'i':
                if (ends("iciti")) { r("ic"); break; }
                break;
            case 'l':
                if (ends("ical")) { r("ic"); break; }
                if (ends("ful")) { r(""); break; }
                break;
            case 's':
                if (ends("ness")) { r(""); break; }
                break;
        }
    }

    /** Removes -ant, -ence, -er, ... when the remaining stem has m() > 1. */
    private final void step5()
    {
        if (k == 0) return;
        switch (b[k - 1])
        {
            case 'a':
                if (ends("al")) break;
                return;
            case 'c':
                if (ends("ance")) break;
                if (ends("ence")) break;
                return;
            case 'e':
                if (ends("er")) break;
                return;
            case 'i':
                if (ends("ic")) break;
                return;
            case 'l':
                if (ends("able")) break;
                if (ends("ible")) break;
                return;
            case 'n':
                if (ends("ant")) break;
                if (ends("ement")) break;
                if (ends("ment")) break;
                if (ends("ent")) break;
                return;
            case 'o':
                // -ion is only removed after 's' or 't'.
                if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
                if (ends("ou")) break;
                return;
            case 's':
                if (ends("ism")) break;
                return;
            case 't':
                if (ends("ate")) break;
                if (ends("iti")) break;
                return;
            case 'u':
                if (ends("ous")) break;
                return;
            case 'v':
                if (ends("ive")) break;
                return;
            case 'z':
                if (ends("ize")) break;
                return;
            default:
                return;
        }
        if (m() > 1) k = j;
    }

    /** Removes a final -e when m() > 1 (or m() == 1 without a cvc ending) and reduces -ll to -l when m() > 1. */
    private final void step6()
    {
        j = k;
        if (b[k] == 'e')
        {
            int a = m();
            if (a > 1 || a == 1 && !cvc(k - 1)) k--;
        }
        if (b[k] == 'l' && doublec(k) && m() > 1) k--;
    }

    /**
     * Stems the word accumulated through {@code add(...)}. The result is then
     * available from {@link #toString()}, and the buffer is reset so the next
     * word can be added. Words of one or two letters are left unchanged.
     */
    public void stem()
    {
        k = i - 1;
        if (k > 1)
        {
            step1(); step2(); step3(); step4(); step5(); step6();
        }
        i_end = k + 1;
        i = 0;
    }

    /**
     * Stems every word in {@code originaltext}. A word is a maximal run of
     * characters for which {@link Character#isLetter(char)} is true; each word
     * is lower-cased and stemmed, and every other character is copied through
     * unchanged.
     *
     * <p>The text is scanned as chars rather than bytes, so non-ASCII input is
     * not corrupted and the result does not depend on the platform charset.
     * Words of any length are handled.
     *
     * @param originaltext text to process; {@code null} is treated as empty
     * @return the text with every word replaced by its stem
     */
    public String getStemmer(String originaltext)
    {
        if (originaltext == null)
        {
            return "";
        }
        int length = originaltext.length();
        StringBuilder stemtext = new StringBuilder(length);
        int pos = 0;
        while (pos < length)
        {
            char ch = originaltext.charAt(pos);
            if (Character.isLetter(ch))
            {
                // Feed the whole run of letters to the stemmer, lower-cased.
                while (pos < length && Character.isLetter(originaltext.charAt(pos)))
                {
                    add(Character.toLowerCase(originaltext.charAt(pos)));
                    pos++;
                }
                stem();
                stemtext.append(toString());
            }
            else
            {
                stemtext.append(ch);
                pos++;
            }
        }
        return stemtext.toString();
    }

    /**
     * Small demo: prints the stemmed form of a few sample phrases.
     *
     * @param args unused
     */
    public static void main(String[] args)
    {
        PorterStemmer s = new PorterStemmer();
        System.out.println(s.getStemmer("parallel computer"));
        System.out.println(s.getStemmer("parallel computing"));
        System.out.println(s.getStemmer("pens"));
        System.out.println(s.getStemmer("pen"));
    }
}