HDInsight Programming
Port
• HDFS: http://localhost:50070/
• Oozie: http://localhost:11000/oozie/v1/admin/status
• Templeton (WebHCat): http://localhost:50111/templeton/v1/status
• ODBC: use port 10000 in the DSN configuration or connection string (see the sketch below).
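As a minimal sketch of the ODBC route, assuming a system DSN named "HiveDSN" has already been configured with the Hive ODBC driver on port 10000 (the DSN name is hypothetical; the src table is the one used in the CreateHive example below):

using System;
using System.Data.Odbc;

class HiveOdbcSample
{
    static void Main()
    {
        // "HiveDSN" is a hypothetical DSN pointing at the cluster on port 10000.
        using (var conn = new OdbcConnection("DSN=HiveDSN"))
        {
            conn.Open();
            using (var cmd = new OdbcCommand("select count(*) from src", conn))
            {
                Console.WriteLine(cmd.ExecuteScalar());
            }
        }
    }
}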
HDFS WebClient
NuGet: Microsoft .NET API for Hadoop WebClient (WebHDFS)
List Directory
var client = new WebHDFSClient(new Uri("http://localhost:50070"), "hadoop");
client.GetDirectoryStatus("/").ContinueWith(dl =>
    dl.Result.Directories.ToList().ForEach(d => Console.WriteLine("/" + d.PathSuffix)));
Create Directory
// Runs inside an async method.
var client = new WebHDFSClient(new Uri("http://localhost:50070"), "hadoop");
var created = await client.CreateDirectory("/TEST");
Console.WriteLine("True or False, we created the directory " + created.ToString());
var deleted = await client.DeleteDirectory("/TEST");
Console.WriteLine("True or False, we deleted the directory " + deleted.ToString());
Task Chaining
client.CreateDirectory("/TEST")
    .ContinueWith(x => client.CreateFile(@"c:\tmp\Titles.txt", "/user/hadoop/titles.txt")
        .ContinueWith(t => Console.WriteLine("new file located at " + t.Result))
        .ContinueWith(t => client.OpenFile("/user/hadoop/titles.txt")
            .ContinueWith(resp => resp.Result.Content.ReadAsStringAsync()
                .ContinueWith(bigString => Console.WriteLine("new file is " + bigString.Result.Length + " bytes long"))
                .ContinueWith(t2 => client.DeleteDirectory("/user/hadoop/titles.txt")
                    .ContinueWith(b => Console.WriteLine("Successfully deleted file."))))));
WebHCat 
• Management of HCatalog metadata. 
• Hive job submission. 
• Pig job submission. 
• Map/Reduce job submission. 
• Streaming Map/Reduce job submission.
CreateHive
using System.Net.Http;
using Newtonsoft.Json.Linq; // for JObject

string outputDir = "basichivejob";
var client = new WebHCatHttpClient(new Uri("http://localhost:50111"), "administrator", "", "hadoop");
var t1 = client.CreateHiveJob(@"select * from src;", null, null, outputDir, null);
t1.Wait();
var response = t1.Result;
var output = response.Content.ReadAsAsync<JObject>();
output.Wait();
response.EnsureSuccessStatusCode();
string id = output.Result.GetValue("id").ToString();
client.WaitForJobToCompleteAsync(id).Wait();
Oozie 
http://hadoopsdk.codeplex.com/wikipage?title=Oozie%20Client&referringTitle=Home
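The page above documents the SDK's Oozie client. Independently of it, the admin/status endpoint from the Port list can be queried with a plain HttpClient as a quick sanity check (a minimal sketch):

using System;
using System.Net.Http;

class OozieStatusCheck
{
    static void Main()
    {
        using (var http = new HttpClient())
        {
            // Expect a small JSON document such as {"systemMode":"NORMAL"}.
            var json = http.GetStringAsync("http://localhost:11000/oozie/v1/admin/status").Result;
            Console.WriteLine(json);
        }
    }
}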
.NET MapReduce
MRRunner
Mapper
public class SqrtMapper : MapperBase
{
    public override void Map(string inputLine, MapperContext context)
    {
        int inputValue = int.Parse(inputLine);

        // Perform the work.
        double sqrt = Math.Sqrt((double)inputValue);

        // Write output data.
        context.EmitKeyValue(inputValue.ToString(), sqrt.ToString());
    }
}
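The Hadoop Job below plugs mapper, combiner, and reducer types together. For completeness, here is a minimal reducer sketch to pair with SqrtMapper, assuming the SDK's ReducerCombinerBase signature (the class name SqrtReducer is illustrative):

using System.Collections.Generic;
using System.Linq;

public class SqrtReducer : ReducerCombinerBase
{
    public override void Reduce(string key, IEnumerable<string> values, ReducerCombinerContext context)
    {
        // SqrtMapper emits one square root per input value, so pass the first value through unchanged.
        context.EmitKeyValue(key, values.First());
    }
}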
Hadoop Job
public class FirstJob : HadoopJob<Mapper, Combiner, Reducer>
{
    public override HadoopJobConfiguration Configure(ExecutorContext context)
    {
        HadoopJobConfiguration config = new HadoopJobConfiguration();
        config.InputPath = "input/SqrtJob";
        config.OutputFolder = "output/SqrtJob";
        return config;
    }
}
var hadoop = Hadoop.Connect(); 
hadoop.MapReduceJob.ExecuteJob<JobType>(arguments);
MRRunner -dll MyMRProgram.dll {-class jobClass} {-- job-class options}
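For example, assuming FirstJob above is compiled into SqrtJob.dll (assembly name illustrative):

MRRunner -dll SqrtJob.dll -class FirstJob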
LINQ to Hive
HiveRow
public class TitlesRow : HiveRow
{
    public string MovieId { get; set; }
    public string Name { get; set; }
    public int Year { get; set; }
    public string Rating { get; set; }
}

public class AwardsRow : HiveRow
{
    public string MovieId { get; set; }
    public string AwardId { get; set; }
    public int Year { get; set; }
    public string Won { get; set; }
    public string Type { get; set; }
    public string Category { get; set; }
}

public class ActorsRow : HiveRow
{
    public string MovieId { get; set; }
    public string ActorId { get; set; }
    public int AwardsCount { get; set; }
    public string Name { get; set; }
}
HiveConnection
public class MyHiveDatabase : HiveConnection
{
    public MyHiveDatabase(Uri webHcatUri, string username, string password, string azureStorageAccount, string azureStorageKey)
        : base(webHcatUri, username, password, azureStorageAccount, azureStorageKey) { }

    public HiveTable<AwardsRow> Awards
    {
        get { return this.GetTable<AwardsRow>("Awards"); }
    }

    public HiveTable<TitlesRow> Titles
    {
        get { return this.GetTable<TitlesRow>("Titles"); }
    }

    public HiveTable<ActorsRow> Actors
    {
        get { return this.GetTable<ActorsRow>("Actors"); }
    }
}
Simple LINQ
var db = new MyHiveDatabase(
    webHcatUri: new Uri("http://localhost:50111"),
    username: "hadoop", password: null,
    azureStorageAccount: "ASV storage account name", azureStorageKey: "ASV storage account key");

var q = from x in
            (from a in db.Actors
             select new { a.ActorId, foo = a.AwardsCount })
        group x by x.ActorId into g
        select new { ActorId = g.Key, bar = g.Average(z => z.foo) };

q.ExecuteQuery().Wait();
var results1 = q.ToList();

var projectionQuery = from aw in db.Awards
                      join t in db.Titles on aw.MovieId equals t.MovieId
                      where t.Year == 1994 && aw.Won == "True"
                      select new { MovieId = t.MovieId, Name = t.Name, Type = aw.Type, Category = aw.Category, Year = t.Year };

var newTable = projectionQuery.CreateTable("AwardsIn1994");
Excel ODBC
http://www.microsoft.com/en-us/download/details.aspx?id=40886
Resource 
• http://hadoopsdk.codeplex.com/
• https://github.com/WindowsAzure-Samples/HDInsight-Labs-Preview
• http://wag.codeplex.com/
Mahout
Machine learning is programming computers to optimize a performance criterion using example data or past experience.
Classification
Clustering
Recommenders
Collaborative Filtering - User Based
Collaborative Filtering - Item Based
Data 
http://labrosa.ee.columbia.edu/millionsong/tasteprofile 
http://www.grouplens.org/node/12
Mahout Command
c:\apps\dist\mahout-0.7\bin> hadoop jar c:\Apps\dist\mahout-0.7\mahout-core-0.7-job.jar org.apache.mahout.cf.taste.hadoop.item.RecommenderJob -s SIMILARITY_COOCCURRENCE --input=input/mInput.txt --output=output --usersFile=input/users.txt
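RecommenderJob writes one line per user into the output folder. A small sketch for reading those recommendations back in .NET, assuming the usual userID<TAB>[itemID:score,...] layout (the part file path is illustrative):

using System;
using System.IO;

class ReadRecommendations
{
    static void Main()
    {
        // Each line looks like: 42	[101:4.5,205:3.9]
        foreach (var line in File.ReadLines(@"c:\output\part-r-00000"))
        {
            var parts = line.Split('\t');
            var userId = parts[0];
            var items = parts[1].Trim('[', ']').Split(',');
            Console.WriteLine(userId + " -> " + string.Join(", ", items));
        }
    }
}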
