SlideShare a Scribd company logo
1 of 10
Download to read offline
import pandas as pd
# Load the job-postings export, decoding it as ISO-8859-1 (Latin-1).
# NOTE(review): the preview printed below shows garbled punctuation in a few
# cells, so the declared encoding may not match every byte -- confirm.
data = pd.read_csv(
    "/content/DataSet.xlsx - Feuil1 (1).csv",
    encoding="ISO-8859-1",
)
# Notebook display of the first rows as a quick sanity check.
data.head()
Job title Company
name 
0 Développeur Dynamics 365
AIVENTU
1 Manager Development NeoXam
Tunisia
2 Développeur ios/swift (+ 3ans d⠙expérience) Asr
€
développement
3 Expert Technique en développement Java JSE (J...
NeoXam
4 Senior Java Development Engineer NeoXam
Tunisia
Job function Job type 
0 Engineering Full-time
1 Information Technology Full-time
2 IOS â “ Swift development Full-time
€
3 Design and development of solutions, technical... Full-time
4 Participating in designing and analyzing solut... Full-time
Industry sector Job recruiter ID 
0 Information Technology and Services 1204
1 Information Technology and Services req557
2 Not specified Not specified
3 Banking and finance. Not provided
4 Information Technology and Services req439
Job website 
0 NaN
1 neoxam.csod.com/ux/ats/careersite/6/home?c=neo...
2 NaN
3 Not provided
4 neoxam.csod.com/ux/ats/care
Job location Application deadline 
0 Les Berges du Lac, Tunis, Tunisia 31/7/2023
1 Tunis, Tunisia 7/7/2023
2 Sousse, Tunisia 30/6/2023
3 Tunis, Tunisia 22/6/2023
4 Tunis, Tunisia 22/6/2023
Required skills 
0 C# or X++, VisualStudio, .NET Framework, SQL S...
1 Java or any other object-oriented language, Li...
2 IOS â “ Swift development,
€
3 Java development, Design Patterns, Web service...
4 Design Pattern, Webservices SOAP/REST, Java Me...
Required education 
0 Bac+5 in computer science or equivalent
1 Bac+5 in engineering or equivalent
2 Not specified
3 Master's degree in computer engineering or equ...
4 Engineering degree or equivalent (BAC +5 or eq...
Required language proficiency 
0 Good level of French and English (spoken and w...
1 Fluent in French and English (both oral and wr...
2 Not specified
3 Good command of written and spoken English and...
4 English and French (oral and written)
Contact email for job application
0 swissal@aiventu.com
1 not specified
2 not specified
3 Not provided
4 not specified
# Drop the posting metadata, identifiers and contact columns, leaving only
# 'Industry sector', 'Required skills', 'Required education' and
# 'Required language proficiency' for the rest of the analysis.
data = data.drop(
    columns=[
        'Application deadline',
        'Job title',
        'Job function',
        'Job location',
        'Job type',
        'Company name',
        'Job recruiter ID',
        'Job website',
        'Contact email for job application',
    ]
)
# Summarise the remaining columns and their non-null counts.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Industry sector 121 non-null object
1 Required skills 120 non-null object
2 Required education 114 non-null object
3 Required language proficiency 121 non-null object
dtypes: object(4)
memory usage: 4.2+ KB
# Replace missing skill descriptions with an empty string so that the string
# operations applied to this column later do not receive NaN values.
data['Required skills'] = data['Required skills'].fillna('')
import spacy
from spacy.matcher import PhraseMatcher
# Load the English language model for spaCy
# NOTE(review): neither `nlp` nor `PhraseMatcher` is referenced in the visible
# part of this notebook -- confirm whether they are still needed.
nlp = spacy.load('en_core_web_sm')
# Skill patterns, grouped by technology family.
#
# Every entry is a regular-expression fragment rather than a literal string:
# the groups are later lower-cased and OR-ed into a single case-insensitive
# regex, so "." stands for any single character (e.g. "SQL.Server" also
# matches "SQL Server").
#
# NOTE(review): the exported source had its string literals split by hard
# line wraps and its backslashes stripped ("sCs", "sRs", "sdnss", "C++" ...).
# They are restored here as r"\sC\s", r"\sR\s", r"\sdns\s", r"C\+\+" etc.;
# verify against the original notebook.
sql_skills = [
    "sql", "PL.SQL", "mysql", "MS.SQL", "SQL.Server", "SQLite", "PostgreSQL",
    "Rubinius", "T.SQL", "mariadb", "vsql",
]
nosql_skills = [
    "CouchDB", "cassandra", "mongo", "hbase", "redis", "Scylla", "Neo4j",
    "DynamoDB", "Memcached", "Bigtable", "Hypertable", "Oracle", "NoSQL",
]
js_back_fram_skills = [
    "javascript", "Backbone", "Express.js", "kraken.js", "Hapi.js", "Koa.js",
    "TotalJS", "Nest.js", "Sails.Js", "Meteor.Js", "LoopBack", "Derby.Js",
    "Adonis.Js", "Mojito", "Keystone.Js", "Feathers.Js", "Restify.Js",
    "ActionHero.Js", "Sequelize", "Moleculer",
]
js_front_framewrok = [
    "javascript", "angular", "vue.js", "react", "ember", "jquery",
    "Prototype", "ext.js",
]
javascript_skills = [
    "node.js", "javascript", "babel", "npm", "eslint", "typescript",
]
sys_skills = [
    "linux", "unix", "ubuntu", "redhat", "SuSE", "Debian", "fedora", "Bash",
    "nginx", "apache", "systemctl", "shell", "systemd", "ssh", "network",
]
# "C" is only matched as a stand-alone word; the "+" signs must be escaped.
c_skills = [
    r"\sC\s", r"C\+\+", "C#", "wxWidgets", "JUCE", "CEGUI", "CEF", "GTK",
    "Qt",
]
xml_skills = ["xml", "html"]
css_skills = ["css", "bootstrap", "sass", "postcss"]
rest_skills = ["soa", "soap", "REST", "http", "ajax"]
git_skills = ["git", "svn"]
java_skills = [
    "Java", "JEE", "Spring", "Struts", "Hibernate", "Wicket", "JSF",
    "Dropwizard", "Grails", "ATG", "maven", "spring.boot", "spring.security",
]
continus_build_skills = [
    "Buddy", "git", "Jenkins", "TeamCity", "GoCD", "Bamboo", "CircleCI",
    "Codeship", "Buildbot", "Nevercode", "Integrity", "Strider", "Autorabit",
    "Buildkite", "Semaphore", "CruiseControl", "Urbancode",
]
virt_skills = [
    "Docker", "Vagrant", "Wox", "Rancher", "Kubernetes", "Mesos", "LXC",
    "OpenVZ", "kvm",
]
cloud_skills = [
    "Cloud", "Computing", "AWS", "google.cloud", "Bluemix", "OVH", "Joyent",
    "Microsoft.Azure", "Cloudwatt", "Ikoula", "Rackspace", "Nimbus",
    "Niftyname", "OpenStack", "OpenNebula", "Eucalyptus", "vultr",
    "DigitalOcean",
]
php_skills = [
    "php", "symfony", "CodeIgniter", "Agavi", "CakePHP", "Dframe", "Flight",
    "FuelPHP", "Hoa", "Horde", "Jelix", "KumbiaPHP", "Laravel", "Laminas",
    "Mkframework", "MODx", "PEAR", "WebSite.PHP", "Zend", "YAF",
]
# "magneto" in the export could never match the Magento CMS; spelling fixed.
cms_skills = ["drupal", "wordpress", "joomla", "squarespace", "magento"]
scrum_skills = [
    "Zimbra", "confluence", "Slack", "Scrum", "Wrike", "Agile", "Trello",
    "JIRA", "Assembla", "nTask", "Targetprocess", "Asana", "Clarizen",
    "QuickScrum", "ScrumDo", "VivifyScrum", "scrumban", "Kanban",
    "Waterfall", "GitScrum",
]
testing_skills = [
    "Testrail", "Zephyr", "JMeter", "TestLink", "Selenium", "QTP", "SoapUI",
    "Tricentis.Tosca", "Telerik", "Katalon Studio", "UFT", "IBM.RFT",
    "Ranorex", "Postman",
]
# The four groups below are defined but not included in skill_phrases.
back_skills = javascript_skills + sql_skills + nosql_skills + ["node.js"]
front_skills = ["webpack"]
teck_skills = ["micro-services"]
po_skills = ["Team Building"]
# "Warehous" is a deliberate stem; "Quoble" in the export is fixed to Qubole.
big_data_skills = [
    "Business Analytic", "Mining", "Warehous", "décisionnelle", "Splunk",
    "Scala", "Julia", "Fluentd", "Grafana", "Memcached", "KNIME",
    "Statistica", "Excel", "dashboarding", "big data", "hadoop", "Spark",
    "Storm", "RapidMiner", "SAMOA", "HPCC", "Qubole", "Hive", "Cloudera",
    "Openrefine", "Teradata", "kafka",
]
# Short acronyms are wrapped in \s...\s so they only match as whole words.
networking_skills = [
    "cisco", "pfsense", "san", r"\sdns\s", "dhcp", r"\snat\s", "huawei",
    "Troubleshooting", "ccna", "socket", r"\sip\s", "tcp", "udp", "ssh",
    "telnet", r"\slan\s", "vpn", "Packet Tracer",
]
bi_skills = [
    "SAP", "Sisense", "bods", "Talend", "warehouse", "QlikSense", "Power.BI",
    "Looker", "Tableau", "zoho", "ssis", "ssas", "ssrs",
]
embarque_skills = ["raspberry", "Stm", "assembleur", "arduino"]
matlab_skills = ["matlab"]
security_skills = [
    "ips", "audit", "Symantec", "Metasploit", "Hacking", "Vulnera",
    "Phishing", "Malware",
]
telecom_skills = ["voip", "gsm"]
python_skills = [
    "python", "Tkinter", "CherryPy", "Django", "Flask", "Pylons", "Pyramid",
    "Web2py", "BeautifulSoup",
]
# "py.torsh" in the export could never match PyTorch; spelling fixed.
deep_skills = [
    "deep", "tensorflow", "Recurrent Neural", "RNN", "Keras", "pytorch",
    "caffe", "convolution neural network", "cnn",
]
ml_skills = [
    "Learning", "classification", "regression", "clustering",
    "Computer vision", "srilm", "theano", "Lasagne", "scikit", "Anaconda",
    "scrap", "ADAS",
]
robo_skills = ["kuka", "linx", "winautomation", "automai"]
elastic_search_skills = ["Elasticsearch", "Logstash", "Kibana"]
r_skills = [r"\sR\s"]
#analytic_skills =["Matplotlib", "nltk","Bokeh","gensim"]
# TODO: update list
skill_phrases = [
    sql_skills, nosql_skills, js_back_fram_skills, js_front_framewrok,
    javascript_skills, sys_skills, c_skills, xml_skills, css_skills,
    rest_skills, git_skills, java_skills, continus_build_skills,
    virt_skills, cloud_skills, php_skills, cms_skills, scrum_skills,
    testing_skills, networking_skills, big_data_skills, bi_skills,
    embarque_skills, matlab_skills, security_skills, telecom_skills,
    python_skills, deep_skills, ml_skills, robo_skills,
    elastic_search_skills, r_skills,
]
import re

# Flatten every skill group into one lower-cased, de-duplicated set of
# regex fragments.
all_skills = {
    pattern.lower().strip()
    for group in skill_phrases
    for pattern in group
}

# Regex alternation takes the first alternative that matches, so the order
# matters: joining the set directly made the result depend on set iteration
# order and let a short pattern ("sql") pre-empt a longer one
# ("sql.server"). Sorting longest-first (ties broken alphabetically) makes
# the pattern deterministic and prefers the most specific skill.
ordered_skills = sorted(all_skills, key=lambda pattern: (-len(pattern), pattern))
reg_all_skills = "(" + "|".join(ordered_skills) + ")"

# Find every skill mention in the 'Required skills' column
# (case-insensitive) and keep the distinct matched strings per posting.
data['extracted_skills'] = (
    data['Required skills']
    .str.findall(reg_all_skills, flags=re.IGNORECASE)
    .apply(set)
)
data.head()
Industry sector 
0 Information Technology and Services
1 Information Technology and Services
2 Not specified
3 Banking and finance.
4 Information Technology and Services
Required skills 
0 C# or X++, VisualStudio, .NET Framework, SQL S...
1 Java or any other object-oriented language, Li...
2 IOS â “ Swift development,
€
3 Java development, Design Patterns, Web service...
4 Design Pattern, Webservices SOAP/REST, Java Me...
Required education 
0 Bac+5 in computer science or equivalent
1 Bac+5 in engineering or equivalent
2 Not specified
3 Master's degree in computer engineering or equ...
4 Engineering degree or equivalent (BAC +5 or eq...
Required language proficiency 
0 Good level of French and English (spoken and w...
1 Fluent in French and English (both oral and wr...
2 Not specified
3 Good command of written and spoken English and...
4 English and French (oral and written)
extracted_skills
0 {C#, SQL Server}
1 {PostgreSQL, Linux, cloud, KANBAN, SQL Server,...
2 {}
3 {REST, HTML, SOAP, Java, CSS}
4 {REST, Unix, SQL Server, HTML, Oracle, SOAP, J...
from sklearn.feature_extraction.text import CountVectorizer
# Turn each posting's set of extracted skills into one space-separated
# string so it can be fed to CountVectorizer.
skill_strings = data['extracted_skills'].apply(lambda x: ' '.join(x))

# binary=True records presence/absence (0/1) of a token instead of its count.
# NOTE(review): the vectorizer tokenises the joined string itself, so
# multi-word or dotted matches appear to be split into separate columns (the
# feature list printed further down contains "server", "js" and "ext") --
# confirm this is intended.
vectorizer = CountVectorizer(binary=True)

# Fit the vocabulary and encode every posting as a row of 0/1 features.
skill_features = vectorizer.fit_transform(skill_strings)

# Convert the sparse matrix into a DataFrame with one named column per token.
skill_df = pd.DataFrame(
    skill_features.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
skill_df.head(10)
agile angular aws babel bamboo bi bootstrap cloud computing
css 
0 0 0 0 0 0 0 0 0 0
0
1 0 0 1 0 0 0 0 1 0
0
2 0 0 0 0 0 0 0 0 0
0
3 0 0 0 0 0 0 0 0 0
1
4 0 0 0 0 0 0 0 0 0
1
5 0 0 0 0 0 0 0 0 0
1
6 0 1 0 0 0 0 0 0 0
1
7 0 0 0 0 0 0 0 0 0
0
8 0 0 0 0 1 0 0 0 0
0
9 0 0 0 0 1 0 0 0 0
0
... symfony tcp typescript ubuntu unix voip warehous
warehouse 
0 ... 0 0 0 0 0 0 0
0
1 ... 0 0 0 0 0 0 0
0
2 ... 0 0 0 0 0 0 0
0
3 ... 0 0 0 0 0 0 0
0
4 ... 0 0 0 0 1 0 0
0
5 ... 0 0 0 0 1 0 0
0
6 ... 0 0 0 0 0 0 0
0
7 ... 0 0 0 0 0 0 0
0
8 ... 0 0 0 0 0 0 0
0
9 ... 0 0 0 0 1 0 0
0
wordpress xml
0 0 0
1 0 0
2 0 0
3 0 0
4 0 0
5 0 0
6 0 0
7 0 0
8 0 1
9 0 1
[10 rows x 74 columns]
from sklearn.cluster import KMeans
# Cluster the one-hot skill matrix into 15 groups. A single k-means++
# initialisation with a fixed seed is used for this run.
kmeans_model = KMeans(
    n_clusters=15,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
# Fit the model and get one cluster label per row of skill_df.
kmeans_output = kmeans_model.fit_predict(skill_df)
kmeans_output
array([ 1, 2, 1, 4, 7, 7, 14, 1, 3, 1, 3, 3, 3, 3, 1, 1,
1,
14, 1, 1, 4, 12, 11, 11, 1, 14, 14, 14, 1, 1, 14, 14, 1,
1,
9, 5, 9, 1, 1, 2, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1,
4,
1, 14, 1, 1, 1, 1, 0, 1, 5, 1, 1, 11, 1, 1, 1, 1,
14,
1, 1, 1, 14, 1, 1, 13, 1, 11, 5, 1, 1, 1, 1, 1, 6,
1,
1, 1, 1, 11, 4, 4, 8, 10, 1, 1, 1, 5, 1, 14, 14, 1,
1,
0, 1, 1, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1,
1, 1, 14, 4, 11, 1, 1, 1, 4, 1], dtype=int32)
# Number of cluster labels produced (one per posting).
len(kmeans_output)
129
# Get the cluster centers and feature names
cluster_centers = kmeans_model.cluster_centers_
feature_names = skill_df.columns

# Print the five highest-weighted features of each cluster centre. Because
# the inputs are 0/1 indicators, a centre value is the share of postings in
# that cluster that mention the feature.
for i, center in enumerate(cluster_centers):
    # argsort is ascending: reverse it and keep the first five indices.
    sorted_indices = center.argsort()[::-1][:5]
    print(f"Cluster {i}:")
    for idx in sorted_indices:
        # Tab-indent the feature lines under their cluster heading.
        print(f"\t{feature_names[idx]}: {center[idx]}")
Cluster 0:
python: 1.0
sql: 1.0
hadoop: 0.5
learning: 0.5
nosql: 0.5
Cluster 1:
agile: 0.10126582278480999
sql: 0.07594936708860747
excel: 0.0632911392405063
react: 0.05063291139240497
grafana: 0.03797468354430377
Cluster 2:
postgresql: 1.0
linux: 1.0
java: 1.0
cloud: 1.0
server: 1.0
Cluster 3:
bamboo: 1.0000000000000002
xml: 1.0
linux: 1.0
jenkins: 1.0
sql: 1.0
Cluster 4:
java: 1.0
agile: 0.857142857142857
scrum: 0.7142857142857143
html: 0.7142857142857142
css: 0.5714285714285714
Cluster 5:
network: 1.0
linux: 1.0
cloud: 0.5
ubuntu: 0.25
tcp: 0.25
Cluster 6:
css: 1.0
jquery: 1.0
react: 1.0
git: 1.0
server: 1.0
Cluster 7:
css: 1.0
oracle: 1.0
html: 1.0
java: 1.0
server: 1.0
Cluster 8:
mysql: 1.0
js: 1.0
git: 1.0
shell: 1.0
jquery: 1.0
Cluster 9:
xml: 1.0
sql: 1.0
java: 1.0
oracle: 1.0
server: 1.0
Cluster 10:
mysql: 1.0
bootstrap: 1.0
java: 1.0
react: 1.0
jquery: 1.0
Cluster 11:
docker: 1.0
git: 0.8333333333333333
kubernetes: 0.49999999999999994
linux: 0.33333333333333337
agile: 0.3333333333333333
Cluster 12:
hibernate: 1.0
postgresql: 1.0
redis: 1.0
ips: 1.0
java: 1.0
Cluster 13:
maven: 1.0
js: 1.0
react: 1.0
java: 1.0
ext: 1.0
Cluster 14:
java: 0.857142857142857
angular: 0.5714285714285714
react: 0.35714285714285704
rest: 0.21428571428571427
html: 0.14285714285714288

More Related Content

Similar to vertopal.com_DataEncodingForDataClustering-5 (1).pdf

Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
Evolving a Clean, Pragmatic Architecture at JBCNConf 2019Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
Victor Rentea
 
The Art Of Readable Code
The Art Of Readable CodeThe Art Of Readable Code
The Art Of Readable Code
Baidu, Inc.
 
Evolving your Data Access with MongoDB Stitch
Evolving your Data Access with MongoDB StitchEvolving your Data Access with MongoDB Stitch
Evolving your Data Access with MongoDB Stitch
MongoDB
 

Similar to vertopal.com_DataEncodingForDataClustering-5 (1).pdf (20)

Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
Evolving a Clean, Pragmatic Architecture at JBCNConf 2019Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
Evolving a Clean, Pragmatic Architecture at JBCNConf 2019
 
Session 2 django material for training at baabtra models
Session 2 django material for training at baabtra modelsSession 2 django material for training at baabtra models
Session 2 django material for training at baabtra models
 
Learn D3.js in 90 minutes
Learn D3.js in 90 minutesLearn D3.js in 90 minutes
Learn D3.js in 90 minutes
 
Ember.js Tokyo event 2014/09/22 (English)
Ember.js Tokyo event 2014/09/22 (English)Ember.js Tokyo event 2014/09/22 (English)
Ember.js Tokyo event 2014/09/22 (English)
 
The Art Of Readable Code
The Art Of Readable CodeThe Art Of Readable Code
The Art Of Readable Code
 
Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010Advanced Php - Macq Electronique 2010
Advanced Php - Macq Electronique 2010
 
Evolving your Data Access with MongoDB Stitch
Evolving your Data Access with MongoDB StitchEvolving your Data Access with MongoDB Stitch
Evolving your Data Access with MongoDB Stitch
 
Get started with R lang
Get started with R langGet started with R lang
Get started with R lang
 
CGI.ppt
CGI.pptCGI.ppt
CGI.ppt
 
Scalable web application architecture
Scalable web application architectureScalable web application architecture
Scalable web application architecture
 
Automate ml workflow_transmogrif_ai-_chetan_khatri_berlin-scala
Automate ml workflow_transmogrif_ai-_chetan_khatri_berlin-scalaAutomate ml workflow_transmogrif_ai-_chetan_khatri_berlin-scala
Automate ml workflow_transmogrif_ai-_chetan_khatri_berlin-scala
 
Beyond php it's not (just) about the code
Beyond php   it's not (just) about the codeBeyond php   it's not (just) about the code
Beyond php it's not (just) about the code
 
RDataMining slides-r-programming
RDataMining slides-r-programmingRDataMining slides-r-programming
RDataMining slides-r-programming
 
Viktor Tsykunov: Azure Machine Learning Service
Viktor Tsykunov: Azure Machine Learning ServiceViktor Tsykunov: Azure Machine Learning Service
Viktor Tsykunov: Azure Machine Learning Service
 
Prompt engineering for iOS developers (How LLMs and GenAI work)
Prompt engineering for iOS developers (How LLMs and GenAI work)Prompt engineering for iOS developers (How LLMs and GenAI work)
Prompt engineering for iOS developers (How LLMs and GenAI work)
 
Apex Enterprise Patterns: Building Strong Foundations
Apex Enterprise Patterns: Building Strong FoundationsApex Enterprise Patterns: Building Strong Foundations
Apex Enterprise Patterns: Building Strong Foundations
 
DIWE - Advanced PHP Concepts
DIWE - Advanced PHP ConceptsDIWE - Advanced PHP Concepts
DIWE - Advanced PHP Concepts
 
Getting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdfGetting started with Pandas Cheatsheet.pdf
Getting started with Pandas Cheatsheet.pdf
 
apurva resume
apurva resumeapurva resume
apurva resume
 
iOS와 케라스의 만남
iOS와 케라스의 만남iOS와 케라스의 만남
iOS와 케라스의 만남
 

Recently uploaded

怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
vexqp
 
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Klinik kandungan
 
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
nirzagarg
 
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
gajnagarg
 
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
HyderabadDolls
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
ahmedjiabur940
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
nirzagarg
 
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
HyderabadDolls
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
nirzagarg
 
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
gajnagarg
 

Recently uploaded (20)

怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
怎样办理圣地亚哥州立大学毕业证(SDSU毕业证书)成绩单学校原版复制
 
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24  Building Real-Time Pipelines With FLaNKDATA SUMMIT 24  Building Real-Time Pipelines With FLaNK
DATA SUMMIT 24 Building Real-Time Pipelines With FLaNK
 
Discover Why Less is More in B2B Research
Discover Why Less is More in B2B ResearchDiscover Why Less is More in B2B Research
Discover Why Less is More in B2B Research
 
Aspirational Block Program Block Syaldey District - Almora
Aspirational Block Program Block Syaldey District - AlmoraAspirational Block Program Block Syaldey District - Almora
Aspirational Block Program Block Syaldey District - Almora
 
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
Charbagh + Female Escorts Service in Lucknow | Starting ₹,5K To @25k with A/C...
 
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
Jual obat aborsi Bandung ( 085657271886 ) Cytote pil telat bulan penggugur ka...
 
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Satna [ 7014168258 ] Call Me For Genuine Models We ...
 
Fun all Day Call Girls in Jaipur 9332606886 High Profile Call Girls You Ca...
Fun all Day Call Girls in Jaipur   9332606886  High Profile Call Girls You Ca...Fun all Day Call Girls in Jaipur   9332606886  High Profile Call Girls You Ca...
Fun all Day Call Girls in Jaipur 9332606886 High Profile Call Girls You Ca...
 
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
Top profile Call Girls In dimapur [ 7014168258 ] Call Me For Genuine Models W...
 
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
Jodhpur Park | Call Girls in Kolkata Phone No 8005736733 Elite Escort Service...
 
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi ArabiaIn Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
In Riyadh ((+919101817206)) Cytotec kit @ Abortion Pills Saudi Arabia
 
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Purnia [ 7014168258 ] Call Me For Genuine Models We...
 
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
Nirala Nagar / Cheap Call Girls In Lucknow Phone No 9548273370 Elite Escort S...
 
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
Sealdah % High Class Call Girls Kolkata - 450+ Call Girl Cash Payment 8005736...
 
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
Top profile Call Girls In Hapur [ 7014168258 ] Call Me For Genuine Models We ...
 
Top Call Girls in Balaghat 9332606886Call Girls Advance Cash On Delivery Ser...
Top Call Girls in Balaghat  9332606886Call Girls Advance Cash On Delivery Ser...Top Call Girls in Balaghat  9332606886Call Girls Advance Cash On Delivery Ser...
Top Call Girls in Balaghat 9332606886Call Girls Advance Cash On Delivery Ser...
 
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
Top profile Call Girls In Indore [ 7014168258 ] Call Me For Genuine Models We...
 
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
Digital Advertising Lecture for Advanced Digital & Social Media Strategy at U...
 
Dubai Call Girls Peeing O525547819 Call Girls Dubai
Dubai Call Girls Peeing O525547819 Call Girls DubaiDubai Call Girls Peeing O525547819 Call Girls Dubai
Dubai Call Girls Peeing O525547819 Call Girls Dubai
 
Predicting HDB Resale Prices - Conducting Linear Regression Analysis With Orange
Predicting HDB Resale Prices - Conducting Linear Regression Analysis With OrangePredicting HDB Resale Prices - Conducting Linear Regression Analysis With Orange
Predicting HDB Resale Prices - Conducting Linear Regression Analysis With Orange
 

vertopal.com_DataEncodingForDataClustering-5 (1).pdf

  • 1. import pandas as pd data=pd.read_csv("/content/DataSet.xlsx - Feuil1 (1).csv", encoding = "ISO-8859-1") data.head() Job title Company name 0 Développeur Dynamics 365 AIVENTU 1 Manager Development NeoXam Tunisia 2 Développeur ios/swift (+ 3ans dâ ™expérience) Asr € développement 3 Expert Technique en développement Java JSE (J... NeoXam 4 Senior Java Development Engineer NeoXam Tunisia Job function Job type 0 Engineering Full-time 1 Information Technology Full-time 2 IOS â “ Swift development Full-time € 3 Design and development of solutions, technical... Full-time 4 Participating in designing and analyzing solut... Full-time Industry sector Job recruiter ID 0 Information Technology and Services 1204 1 Information Technology and Services req557 2 Not specified Not specified 3 Banking and finance. Not provided 4 Information Technology and Services req439 Job website 0 NaN 1 neoxam.csod.com/ux/ats/careersite/6/home?c=neo... 2 NaN 3 Not provided 4 neoxam.csod.com/ux/ats/care Job location Application deadline 0 Les Berges du Lac, Tunis, Tunisia 31/7/2023 1 Tunis, Tunisia 7/7/2023 2 Sousse, Tunisia 30/6/2023 3 Tunis, Tunisia 22/6/2023 4 Tunis, Tunisia 22/6/2023 Required skills 0 C# or X++, VisualStudio, .NET Framework, SQL S... 1 Java or any other object-oriented language, Li...
  • 2. 2 IOS â “ Swift development, € 3 Java development, Design Patterns, Web service... 4 Design Pattern, Webservices SOAP/REST, Java Me... Required education 0 Bac+5 in computer science or equivalent 1 Bac+5 in engineering or equivalent 2 Not specified 3 Master's degree in computer engineering or equ... 4 Engineering degree or equivalent (BAC +5 or eq... Required language proficiency 0 Good level of French and English (spoken and w... 1 Fluent in French and English (both oral and wr... 2 Not specified 3 Good command of written and spoken English and... 4 English and French (oral and written) Contact email for job application 0 swissal@aiventu.com 1 not specified 2 not specified 3 Not provided 4 not specified # Drop the 'Job recruiter ID', 'Job website', and 'Contact email for job application' columns data = data.drop(['Application deadline','Job title','Job function','Job location','Job type','Company name','Job recruiter ID', 'Job website', 'Contact email for job application','Job type'], axis=1) data.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 129 entries, 0 to 128 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Industry sector 121 non-null object 1 Required skills 120 non-null object 2 Required education 114 non-null object 3 Required language proficiency 121 non-null object dtypes: object(4) memory usage: 4.2+ KB data['Required skills'] = data['Required skills'].fillna('') import spacy from spacy.matcher import PhraseMatcher
  • 3. # Load the English language model for spaCy nlp = spacy.load('en_core_web_sm') # Define a list of skill phrases to match sql_skills = ["sql","PL.SQL","mysql","MS.SQL","SQL.Server","SQLite","PostgreSQL","R ubinius","T.SQL","mariadb","vsql"] nosql_skills=["CouchDB","cassandra","mongo","hbase","redis","Scylla"," Neo4j","DynamoDB","Memcached","Bigtable","Hypertable","Oracle","NoSQL" ] js_back_fram_skills=["javascript","Backbone","Express.js","kraken.js", "Hapi.js","Koa.js","TotalJS","Nest.js","Sails.Js","Meteor.Js","LoopBac k","Derby.Js","Adonis.Js","Mojito","Keystone.Js","Feathers.Js","Restif y.Js","ActionHero.Js","Sequelize","Moleculer"] js_front_framewrok=["javascript","angular","vue.js","react","ember","j query","Prototype","ext.js"] javascript_skills= ["node.js","javascript","babel","npm","eslint", "typescript" ] sys_skills=["linux","unix","ubuntu","redhat","SuSE","Debian","fedora", "Bash","nginx","apache","systemctl","bash","shell","systemd","ssh","ne twork"] c_skills=["sCs","C+ +","C#","wxWidgets","JUCE","CEGUI","CEF","GTK","Qt"] xml_skills=["xml","html"] css_skills=["css","bootstrap","sass","postcss"] rest_skills=["soa","soap","REST","http","ajax"] git_skills=["git","svn"] java_skills=["Java","JEE","Spring","Struts","Hibernate","Wicket","JSF" ,"Dropwizard","Grails","ATG","maven","spring.boot","spring.security"] continus_build_skills=["Buddy","git","Jenkins","TeamCity","GoCD","Bamb oo","CircleCI","Codeship","Buildbot","Nevercode","Integrity","Strider" ,"Autorabit","Buildkite","Semaphore","CruiseControl","Urbancode"] virt_skills=["Docker","Vagrant","Wox","Rancher","Kubernetes","Mesos"," LXC","OpenVZ", "kvm"] cloud_skills= ["Cloud", "Computing","AWS","google.cloud","Bluemix","OVH","Joyent","Microsoft.A zure","Cloudwatt","Ikoula","Rackspace","Nimbus","Niftyname","OpenStack ","OpenNebula","Eucalyptus","vultr","DigitalOcean"] php_skills=["php","symfony","CodeIgniter","Agavi","CakePHP","Dframe"," 
Flight","FuelPHP","Hoa","Horde","Jelix","KumbiaPHP","Laravel","Laminas ","Mkframework","MODx","PEAR","WebSite.PHP","Zend","YAF"] cms_skills=["drupal","wordpress","joomla","squarespace","magneto"] scrum_skills= ["Zimbra","confluence","Slack","Scrum","Wrike","Agile","Trello","JIRA" ,"Assembla","nTask","Targetprocess","Asana","Clarizen","QuickScrum","S crumDo","VivifyScrum","scrumban","Kanban","Waterfall","GitScrum"] testing_skills = ["Testrail","Zephyr","JMeter","TestLink","Selenium","QTP","SoapUI","Tr icentis.Tosca","Telerik","Katalon
  • 4. Studio","UFT","IBM.RFT","Ranorex","Postman"] back_skills=javascript_skills + sql_skills + nosql_skills+["node.js"] front_skills=["webpack"] teck_skills=["micro-services"] po_skills=["Team Building"] big_data_skills = ["Business Analytic","Mining","Warehous","décisionnelle","Splunk","Scala","Julia" , "Fluentd","Grafana","Memcached","KNIME","Statistica","Excel", "dashboarding","big data","hadoop", "Spark", "Storm", "RapidMiner","SAMOA","HPCC", "Quoble", "Hive", "Cloudera", "Openrefine", "Teradata","kafka"] networking_skills = ["cisco","pfsense","san", "sdnss", "dhcp", " snats", "huawei","Troubleshooting","ccna","socket","sip s","tcp","udp","ssh","telnet","slans", "vpn", "Packet Tracer"] bi_skills = ["SAP","Sisense","bods","Talend", "warehouse","QlikSense","Power.BI","Looker","Tableau","zoho","ssis","s sas","ssrs"] embarque_skills = ["raspberry","Stm","assembleur","arduino"] matlab_skills =["matlab"] security_skills = ["ips","audit","Symantec","Metasploit","Hacking","Vulnera","Phishing", "Malware"] telecom_skills = ["voip","gsm"] python_skills =["python","Tkinter","CherryPy","Django","Flask" ,"Pylons", "Pyramid", "Pylons", "Web2py","BeautifulSoup"] deep_skills = ["deep", "tensorflow", "Recurrent Neural","RNN", "Keras", "py.torsh","caffe", "convolution neural network", "cnn"] ml_skills = ["Learning","classification","regression", "clustering" ,"Computer vision","srilm", "theano","Lasagne", "scikit", "Anaconda", "scrap", "ADAS"] robo_skills =["kuka","linx","winautomation","automai"] elastic_search_skills = ["Elasticsearch", "Logstash", "Kibana"] r_skills = ["sRs"] #analytic_skills =["Matplotlib", "nltk","Bokeh","gensim"] # TODO: update list skill_phrases = [sql_skills, nosql_skills, js_back_fram_skills, js_front_framewrok, javascript_skills, sys_skills, c_skills, xml_skills, css_skills, rest_skills, git_skills, java_skills, continus_build_skills, virt_skills, cloud_skills, php_skills, cms_skills, scrum_skills, testing_skills, networking_skills, big_data_skills, 
bi_skills, embarque_skills, matlab_skills, security_skills, telecom_skills, python_skills, deep_skills, ml_skills,
  • 5. robo_skills,elastic_search_skills, r_skills ] all_skills = set([j.lower().strip() for i in skill_phrases for j in i]) reg_all_skills = "("+"|".join(all_skills)+")" import re # Define the skills # Create a regular expression pattern to match any of the skills # Apply the pattern to the Body column and extract the skills data['extracted_skills'] = data['Required skills'].str.findall(reg_all_skills, flags=re.IGNORECASE).apply(set) data.head() Industry sector 0 Information Technology and Services 1 Information Technology and Services 2 Not specified 3 Banking and finance. 4 Information Technology and Services Required skills 0 C# or X++, VisualStudio, .NET Framework, SQL S... 1 Java or any other object-oriented language, Li... 2 IOS â “ Swift development, € 3 Java development, Design Patterns, Web service... 4 Design Pattern, Webservices SOAP/REST, Java Me... Required education 0 Bac+5 in computer science or equivalent 1 Bac+5 in engineering or equivalent 2 Not specified 3 Master's degree in computer engineering or equ... 4 Engineering degree or equivalent (BAC +5 or eq... Required language proficiency 0 Good level of French and English (spoken and w... 1 Fluent in French and English (both oral and wr... 2 Not specified 3 Good command of written and spoken English and... 4 English and French (oral and written) extracted_skills 0 {C#, SQL Server} 1 {PostgreSQL, Linux, cloud, KANBAN, SQL Server,... 2 {} 3 {REST, HTML, SOAP, Java, CSS} 4 {REST, Unix, SQL Server, HTML, Oracle, SOAP, J...
  # • 6.  (slide marker carried over from the transcript)
from sklearn.feature_extraction.text import CountVectorizer

# Space-joined form of each row's skills. It is no longer used for the
# encoding below; kept only so the name stays defined for any later code.
skill_strings = data['extracted_skills'].apply(lambda x: ' '.join(x))

# One-hot encode the extracted skills, one column per *skill*.
#
# The previous version fed the space-joined strings to CountVectorizer's
# default word tokenizer. That split multi-word skills into separate columns
# ("SQL Server" -> "sql" + "server", "cloud computing" -> "cloud" +
# "computing") and dropped tokens shorter than two word characters (so "C#"
# produced no feature at all). A callable analyzer receives each row's set
# untouched and returns its skills as the features; lower-casing merges case
# variants such as "Kanban" / "KANBAN".
vectorizer = CountVectorizer(
    binary=True,
    analyzer=lambda skills: [skill.lower() for skill in skills],
)

# Sparse document-term matrix: rows are job offers, columns are skills.
skill_features = vectorizer.fit_transform(data['extracted_skills'])

# Dense DataFrame with readable column names for easier inspection.
skill_df = pd.DataFrame(
    skill_features.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
skill_df.head(10)

# Recorded output of skill_df.head(10). NOTE: captured from the original run,
# i.e. with the old word-level tokenization -- hence columns such as
# "server" and "computing". The table continues after this section.
#
#    agile  angular  aws  babel  bamboo  bi  bootstrap  cloud  computing  css
# 0      0        0    0      0       0   0          0      0          0    0
# 1      0        0    1      0       0   0          0      1          0    0
# 2      0        0    0      0       0   0          0      0          0    0
# 3      0        0    0      0       0   0          0      0          0    1
# 4      0        0    0      0       0   0          0      0          0    1
# 5      0        0    0      0       0   0          0      0          0    1
# 6      0        1    0      0       0   0          0      0          0    1
# 7      0        0    0      0       0   0          0      0          0    0
# 8      0        0    0      0       1   0          0      0          0    0
# 9      0        0    0      0       1   0          0      0          0    0
#
#    ...  symfony  tcp  typescript  ubuntu  unix  voip  warehous  warehouse
# 0  ...        0    0           0       0     0     0         0          0
# 1  ...        0    0           0       0     0     0         0          0
# 2  ...        0    0           0       0     0     0         0          0
  # • 7.  (slide marker carried over from the transcript)
#
# Recorded output of skill_df.head(10), continued:
#
#    ...  symfony  tcp  typescript  ubuntu  unix  voip  warehous  warehouse
# 3  ...        0    0           0       0     0     0         0          0
# 4  ...        0    0           0       0     1     0         0          0
# 5  ...        0    0           0       0     1     0         0          0
# 6  ...        0    0           0       0     0     0         0          0
# 7  ...        0    0           0       0     0     0         0          0
# 8  ...        0    0           0       0     0     0         0          0
# 9  ...        0    0           0       0     1     0         0          0
#
#    wordpress  xml
# 0          0    0
# 1          0    0
# 2          0    0
# 3          0    0
# 4          0    0
# 5          0    0
# 6          0    0
# 7          0    0
# 8          0    1
# 9          0    1
#
# [10 rows x 74 columns]

from sklearn.cluster import KMeans

# Cluster the job offers on their one-hot skill vectors.
# The settings are collected in one place: 15 clusters, k-means++ seeding,
# a single initialisation capped at 100 iterations, and a fixed seed so the
# assignment is repeatable.
kmeans_settings = dict(
    n_clusters=15,
    init='k-means++',
    max_iter=100,
    n_init=1,
    random_state=42,
)
kmeans_model = KMeans(**kmeans_settings)

# Fit the model and keep the cluster label assigned to every row of skill_df.
kmeans_output = kmeans_model.fit_predict(skill_df)
kmeans_output

# Recorded output (the array continues after this section):
#
# array([ 1, 2, 1, 4, 7, 7, 14, 1, 3, 1, 3, 3, 3, 3, 1, 1, 1,
# 14, 1, 1, 4, 12, 11, 11, 1, 14, 14, 14, 1, 1, 14, 14, 1, 1,
# 9, 5, 9, 1, 1, 2, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 4,
# 1, 14, 1, 1, 1, 1, 0, 1, 5, 1, 1, 11, 1, 1, 1, 1, 14,
# 1, 1, 1, 14, 1, 1, 13, 1, 11, 5, 1, 1, 1, 1, 1, 6, 1,
# 1, 1, 1, 11, 4, 4, 8, 10, 1, 1, 1, 5, 1, 14, 14, 1, 1,
  # • 8.  (slide marker carried over from the transcript)
#
# Recorded output of kmeans_output, continued:
#
# 0, 1, 1, 14, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
# 1, 1, 14, 4, 11, 1, 1, 1, 4, 1], dtype=int32)

# Number of clustered rows.
kmeans_output.shape[0]
# Recorded output: 129

# Get the cluster centers and feature names.
cluster_centers = kmeans_model.cluster_centers_
feature_names = skill_df.columns

# Print the top 5 features for each cluster: sort each centre's coordinates
# in descending order and keep the five largest.
for i, center in enumerate(cluster_centers):
    sorted_indices = center.argsort()[::-1][:5]
    print(f"Cluster {i}:")
    for idx in sorted_indices:
        # The transcript had lost the backslash of "\t", which would print a
        # literal "t" before every feature name; the recorded output below
        # shows no such prefix, so the tab escape is restored.
        print(f"\t{feature_names[idx]}: {center[idx]}")

# Recorded output (continues after this section):
#
# Cluster 0:
#     python: 1.0
#     sql: 1.0
#     hadoop: 0.5
#     learning: 0.5
#     nosql: 0.5
# Cluster 1:
#     agile: 0.10126582278480999
#     sql: 0.07594936708860747
#     excel: 0.0632911392405063
#     react: 0.05063291139240497
#     grafana: 0.03797468354430377
# Cluster 2:
#     postgresql: 1.0
#     linux: 1.0
#     java: 1.0
#     cloud: 1.0
#     server: 1.0
# Cluster 3:
#     bamboo: 1.0000000000000002
#     xml: 1.0
#     linux: 1.0
#     jenkins: 1.0
#     sql: 1.0
# Cluster 4:
#     java: 1.0
#     agile: 0.857142857142857
#     scrum: 0.7142857142857143
#     html: 0.7142857142857142
#     css: 0.5714285714285714
# Cluster 5:
#     network: 1.0
  • 9. linux: 1.0 cloud: 0.5 ubuntu: 0.25 tcp: 0.25 Cluster 6: css: 1.0 jquery: 1.0 react: 1.0 git: 1.0 server: 1.0 Cluster 7: css: 1.0 oracle: 1.0 html: 1.0 java: 1.0 server: 1.0 Cluster 8: mysql: 1.0 js: 1.0 git: 1.0 shell: 1.0 jquery: 1.0 Cluster 9: xml: 1.0 sql: 1.0 java: 1.0 oracle: 1.0 server: 1.0 Cluster 10: mysql: 1.0 bootstrap: 1.0 java: 1.0 react: 1.0 jquery: 1.0 Cluster 11: docker: 1.0 git: 0.8333333333333333 kubernetes: 0.49999999999999994 linux: 0.33333333333333337 agile: 0.3333333333333333 Cluster 12: hibernate: 1.0 postgresql: 1.0 redis: 1.0 ips: 1.0 java: 1.0 Cluster 13: maven: 1.0 js: 1.0 react: 1.0
  • 10. java: 1.0 ext: 1.0 Cluster 14: java: 0.857142857142857 angular: 0.5714285714285714 react: 0.35714285714285704 rest: 0.21428571428571427 html: 0.14285714285714288