既然R语言的程序已经能运行了,那就直接把唐诗也统计一下。(有空还是用C++重写一下:R非常简洁,但是判断不是很精确。)
# Count the most frequent fixed-length character combinations ("words") in a
# corpus of Tang poems and write the top entries to a text file.

# Length (in characters) of the combinations to extract.
group <- 2

# Sentence delimiters, written as a single bracket expression so that `?`,
# `(` and `)` are literals rather than regex operators. Covers the ASCII marks
# , ! ? ; : ( ) and their full-width / CJK counterparts: comma (U+FF0C),
# full stop (U+3002), exclamation mark (U+FF01), question mark (U+FF1F),
# enumeration comma (U+3001), semicolon (U+FF1B), colon (U+FF1A),
# parentheses (U+FF08, U+FF09) and book-title marks (U+300A, U+300B).
# \u escapes keep the pattern independent of the encoding this file is
# sourced with.
sentence_sep <- paste0(
  "[,!?;:()",
  "\uFF0C\u3002\uFF01\uFF1F\u3001\uFF1B\uFF1A\uFF08\uFF09\u300A\u300B",
  "]+"
)

# Brute-force split of one sentence into every run of `n` consecutive
# characters; e.g. a five-character sentence "ABCDE" with n = 2 gives
# "AB" "BC" "CD" "DE". Meaningless combinations simply end up with low counts.
#
# x     : a single sentence (character scalar).
# x.len : number of characters in `x`.
# n     : length of each combination; defaults to the global `group`.
# Returns a character vector, empty when the sentence is shorter than `n`.
splitwords <- function(x, x.len = nchar(x), n = group) {
  if (is.na(x.len) || x.len < n) {
    return(character(0))
  }
  starts <- seq_len(x.len - n + 1L)
  substring(x, starts, starts + n - 1L)
}

# Build the frequency table of n-character combinations for a set of lines.
#
# lines : character vector, one line of the corpus per element.
# n     : length of the combinations to count; defaults to the global `group`.
# top   : maximum number of rows to return.
# Returns a data frame with columns Word (character) and Freq (integer),
# sorted by decreasing frequency, with at most `top` rows.
count_words <- function(lines, n = group, top = 100L) {
  # Short lines are authors/titles; very long lines are stray junk such as
  # URLs, because the text file is not well-formed.
  line_len <- nchar(lines)
  verses <- lines[which(line_len > 10 & line_len < 500)]

  # Sentences are delimited by punctuation.
  sentences <- as.character(unlist(strsplit(verses, sentence_sep)))
  sentences <- sentences[!is.na(sentences) & nzchar(sentences)]

  # An overly long "sentence" is probably garbage; one shorter than `n`
  # cannot contain any combination.
  s_len <- nchar(sentences)
  keep <- which(s_len >= n & s_len <= 10)

  words <- unlist(
    mapply(
      splitwords, sentences[keep], s_len[keep],
      MoreArgs = list(n = n), SIMPLIFY = FALSE, USE.NAMES = FALSE
    ),
    use.names = FALSE
  )
  if (length(words) == 0L) {
    return(data.frame(
      Word = character(0), Freq = integer(0), stringsAsFactors = FALSE
    ))
  }

  freq <- sort(table(words), decreasing = TRUE)
  # Never index past the end: fewer than `top` distinct combinations must not
  # produce NA rows.
  top_freq <- freq[seq_len(min(top, length(freq)))]
  data.frame(
    Word = names(top_freq),
    Freq = as.integer(top_freq),
    stringsAsFactors = FALSE
  )
}

input_file <- "tangshi.txt"
output_file <- "1.txt"

if (file.exists(input_file)) {
  l <- scan(input_file, what = character(), sep = "\n")
  df <- count_words(l, n = group, top = 100L)
  write.table(df, output_file)
} else {
  warning("Input file not found: ", input_file, call. = FALSE)
}
两个字的组合(group = 2):
"Word" "Freq"
"1" "何处" 1653
"2" "不知" 1457
"3" "万里" 1439
"6" "千里" 1294
"7" "今日" 1150
"8" "不见" 1139
"9" "不可" 1133
"10" "春风" 1118
"11" "白云" 1099
"12" "不得" 942
"13" "明月" 888
"14" "人间" 879
"15" "无人" 869
"16" "风吹" 831
"17" "故人" 784
"18" "惆怅" 768
"19" "秋风" 745
"20" "悠悠" 733
"21" "相思" 723
"22" "长安" 721
"23" "白日" 687
"24" "如何" 683
"25" "十年" 674
"26" "青山" 662
"27" "何人" 655
"28" "少年" 628
"29" "相逢" 627
"30" "平生" 585
"31" "寂寞" 584
"32" "天子" 584
"33" "天地" 581
"34" "黄金" 578
"35" "年年" 578
"36" "人不" 576
"37" "何事" 573
"38" "江上" 555
"39" "流水" 548
"40" "回首" 531
"41" "可怜" 531
"42" "主人" 521
"43" "如此" 520
"44" "白发" 516
"45" "今朝" 513
"46" "从此" 503
"47" "日月" 502
"48" "月明" 502
"49" "行人" 500
"50" "落日" 493
"51" "不如" 492
"52" "将军" 492
"53" "归去" 489
"54" "日暮" 482
"55" "别离" 478
"56" "洛阳" 476
"57" "不能" 471
"58" "此时" 470
"59" "天下" 470
"60" "何时" 469
"61" "无事" 467
"62" "芳草" 466
"63" "江南" 463
"64" "相见" 462
"65" "归来" 461
"66" "夕阳" 458
"67" "当时" 454
"68" "杨柳" 451
"69" "风雨" 448
"70" "》)" 445
"71" "东风" 436
"72" "洞庭" 433
"73" "青云" 432
"74" "花落" 428
"75" "参差" 427
"76" "天涯" 426
"77" "芙蓉" 425
"78" "落花" 424
"79" "清风" 421
"80" "不是" 416
"81" "烟霞" 416
"82" "三十" 414
"83" "白头" 413
"84" "桃花" 411
"85" "不相" 410
"86" "唯有" 407
"87" "何如" 404
"88" "南山" 397
"89" "谁能" 395
"90" "君不" 394
"91" "千年" 391
"92" "天上" 389
"93" "如今" 385
"94" "花开" 382
"95" "桃李" 380
"96" "与君" 380
"97" "此地" 378
"98" "殷勤" 378
"99" "浮云" 376
"100" "君王" 375
三个字的组合(group = 3):
"Word" "Freq"
"6" "君不见" 224
"11" "不知何" 127
"13" "行路难" 108
"14" "三千里" 108
"17" "不可见" 100
"22" "知何处" 90
"23" "在何处" 89
"24" "二十年" 87
"28" "三十六" 85
"30" "三十年" 75
"31" "无消息" 74
"32" "不相见" 73
"33" "何处去" 70
"34" "无一事" 70
"35" "洛阳城" 69
"36" "千万里" 69
"38" "何处是" 68
"40" "水东流" 67
"44" "归未得" 65
"45" "向人间" 63
"46" "歌一曲" 62
"49" "千里外" 61
"50" "一杯酒" 61
"52" "明月夜" 58
"53" "归何处" 57
"54" "从此去" 56
"55" "东风吹" 56
"56" "今何在" 55
"57" "皮日休" 55
"58" "人不知" 55
"59" "春风吹" 54
"61" "不知谁" 53
"62" "草萋萋" 53
"63" "归去来" 53
"64" "不得意" 52
"65" "人不见" 52
"66" "无人知" 52
"67" "长安道" 52
"68" "复何如" 51
"69" "人间事" 51
"70" "与君同" 51