How to use Microsoft Web N-gram service
From Data-gov Wiki
|
How to build a Multi-word TagCloud
1. Visit their home page for more information:
http://research.microsoft.com/web-ngram
2. Check out their beta test service. You should see the page "Microsoft Web N-Gram Service Quick Start" after agreeing to the terms of use.
http://web-ngram.research.microsoft.com/info/
3. Install "Visual Studio Express". (I started with C#, following their default step-by-step instructions; I will do more research on accessing their web service from other languages.) It took me a while to get everything installed.
http://www.microsoft.com/express/Windows/
4. Now follow their step-by-step instructions to build your first app. The instructions are great and helpful. I also made a few small modifications as a shortcut: I skipped the step "Modify the Project Configuration" and put the "userToken" and "ngramModel" parameters directly in the source code.
5. Write the code!
using System;
using System.Collections.Generic;
using System.IO;
using System.Text.RegularExpressions;
using System.Linq;
using System.Text;
namespace NGramTest
{
/// <summary>
/// Console demo that groups the words of a typed phrase into multi-word
/// segments by comparing the log-probabilities returned by the Web N-gram
/// lookup service for each word, for the phrase built so far, and for the
/// two joined together.
/// </summary>
class Program
{
    // Thresholds of the join heuristic (see ShouldJoin). All values are compared
    // against the log-probabilities returned by the lookup service.
    private const float MaxProbabilityGap = 4f;   // max |log p(A) - log p(B)| still allowed to join
    private const float MinJoinGain = 1f;         // min |log p(A) + log p(B) - log p(AB)| required to join
    private const float StopWordThreshold = -2f;  // a phrase more probable than this is treated as a stop word

    // Characters that separate the words of the typed phrase.
    private static readonly char[] WordSeparators = { ' ', '\t' };

    static void Main(string[] args)
    {
        SegmentTextFromConsole();
    }

    /// <summary>
    /// Experiment 1: console-level text segmentation. Repeatedly prompts for a
    /// phrase, segments it and prints the result, until an empty line is
    /// entered or the input stream ends.
    /// </summary>
    static void SegmentTextFromConsole()
    {
        string line;
        Console.WriteLine("Please type a phrases");
        // Console.ReadLine returns null at end of input (e.g. redirected stdin);
        // treat that like an empty line instead of passing null to DoSegment.
        while (!string.IsNullOrEmpty(line = Console.ReadLine()))
        {
            Program p = new Program();
            p.DoSegment(line);
            p.printSegmentResult();
            Console.WriteLine("Please type a phrases");
        }
    }

    ///////////////////////////////////////
    // local code
    string userToken = "<YOUR_USER_TOKEN>";
    string ngramModel = "urn:ngram:bing-title:jun09:4";

    // Results of the last DoSegment call. The arrays are sized to the number of
    // words; only the leading entries up to the first null phrase are in use.
    string[] result_phrases = null;
    float[] result_phrases_probability = null;
    bool[] result_phrases_ismulti = null;

    /// <summary>
    /// Prints every phrase produced by the last <see cref="DoSegment"/> call,
    /// with its multi-word flag and log-probability. Prints only the header
    /// when no segmentation has been performed yet.
    /// </summary>
    public void printSegmentResult()
    {
        Console.WriteLine("-------Segmentation Result----------------");
        if (result_phrases == null)
        {
            // DoSegment has not been called; there is nothing to list.
            return;
        }

        for (int i = 0; i < result_phrases.Length; i++)
        {
            // Unused trailing slots are null: joins make fewer phrases than words.
            if (null == result_phrases[i])
            {
                break;
            }

            Console.Write("[phrase {0}]: ", i);
            Console.Write(result_phrases_ismulti[i]);
            Console.Write(" log(p)={0} \t", result_phrases_probability[i]);
            Console.Write(result_phrases[i]);
            Console.WriteLine();
        }
    }

    /// <summary>
    /// Splits <paramref name="line"/> into words and greedily merges each word
    /// into the phrase before it when the join heuristic accepts the pair.
    /// The outcome is stored in the result arrays read by
    /// <see cref="printSegmentResult"/>; intermediate probabilities are
    /// written to the console.
    /// </summary>
    /// <param name="line">The phrase to segment. Words are separated by spaces or tabs.</param>
    /// <exception cref="ArgumentNullException"><paramref name="line"/> is null.</exception>
    public void DoSegment(string line)
    {
        if (line == null)
        {
            throw new ArgumentNullException("line");
        }

        // Pre-segmentation. Empty entries are dropped so that leading, trailing
        // or repeated separators never produce empty "words" for the service.
        string[] words = line.Split(WordSeparators, StringSplitOptions.RemoveEmptyEntries);
        result_phrases = new string[words.Length];
        result_phrases_probability = new float[words.Length];
        result_phrases_ismulti = new bool[words.Length];
        if (words.Length == 0)
        {
            // Nothing to look up; do not open a service connection.
            return;
        }

        // NOTE(review): Close/Abort assume the generated LookupServiceClient is a
        // WCF ClientBase proxy (as produced by "Add Service Reference") - confirm.
        NGramService.LookupServiceClient client = new NGramTest.NGramService.LookupServiceClient();
        bool closed = false;
        try
        {
            int last = 0; // index of the phrase currently being built
            for (int i = 0; i < words.Length; i++)
            {
                Console.WriteLine("++++++");

                // Show the phrase built so far (there is none before the first word).
                if (i > 0)
                {
                    Console.WriteLine("phrase A: " + result_phrases[last]);
                    Console.WriteLine(" log(p(A))= " + result_phrases_probability[last]);
                }

                // Probability of the current word on its own.
                float probability = client.GetProbability(userToken, ngramModel, words[i]);
                Console.WriteLine("phrase B: " + words[i]);
                Console.WriteLine(" log(p(B))= " + probability);

                if (i == 0)
                {
                    // The first word always starts the first phrase.
                    StorePhrase(last, words[i], probability, false);
                    continue;
                }

                // Probability of the previous phrase joined with the current word.
                string jointPhrase = result_phrases[last] + " " + words[i];
                float jointp = client.GetProbability(userToken, ngramModel, jointPhrase);
                Console.WriteLine("phrase AB: " + jointPhrase);
                Console.WriteLine(" log(p(AB))= " + jointp);
                Console.WriteLine(" log(p(A)*p(B))= " + (result_phrases_probability[last] + probability));

                if (ShouldJoin(result_phrases_probability[last], probability, jointp))
                {
                    // Join: the current word extends the phrase in place.
                    StorePhrase(last, jointPhrase, jointp, true);
                }
                else
                {
                    // Separate: the current word starts a new phrase.
                    last++;
                    StorePhrase(last, words[i], probability, false);
                }
            }

            client.Close();
            closed = true;
        }
        finally
        {
            // Release the connection even when a lookup (or Close itself) failed.
            if (!closed)
            {
                client.Abort();
            }
        }
    }

    // Records one phrase with its log-probability and multi-word flag at the given result slot.
    private void StorePhrase(int index, string phrase, float probability, bool isMulti)
    {
        result_phrases[index] = phrase;
        result_phrases_probability[index] = probability;
        result_phrases_ismulti[index] = isMulti;
    }

    // Join heuristic: true when phrase A (log-probability previous) and word B
    // (log-probability current) should be merged, given the log-probability of "A B".
    // NOTE(review): the original also computed a stop-word flag for the current
    // word (current > -2) but never used it; the rule itself is left unchanged.
    private static bool ShouldJoin(float previous, float current, float joint)
    {
        float independent = previous + current; // log(p(A) * p(B))
        bool canJoin = independent < joint;
        bool hasSignificantProbabilityDifference = Math.Abs(previous - current) > MaxProbabilityGap;
        bool isJoinSignificantEnough = Math.Abs(independent - joint) > MinJoinGain;
        bool previousIsStopWord = previous > StopWordThreshold;
        return canJoin
            && !hasSignificantProbabilityDifference
            && isJoinSignificantEnough
            && !previousIsStopWord;
    }
}
}
Facts about How to use Microsoft Web N-gram service (RDF feed)
| Dcterms:created | 30 April 2010 + |
| Dcterms:creator | Li Ding + |
| Dcterms:description | this tutorial shows you how to use Microsoft Web N-gram service to build something interesting with example source code. |
| Dcterms:modified | 2010-4-30 |
| Foaf:name | How to use Microsoft Web N-gram service |
| Skos:altLabel | How to use Microsoft Web N-gram service +, how to use microsoft web n-gram service +, and HOW TO USE MICROSOFT WEB N-GRAM SERVICE + |

