2. 2 IOS – Swift development,
3 Java development, Design Patterns, Web service...
4 Design Pattern, Webservices SOAP/REST, Java Me...
Required education
0 Bac+5 in computer science or equivalent
1 Bac+5 in engineering or equivalent
2 Not specified
3 Master's degree in computer engineering or equ...
4 Engineering degree or equivalent (BAC +5 or eq...
Required language proficiency
0 Good level of French and English (spoken and w...
1 Fluent in French and English (both oral and wr...
2 Not specified
3 Good command of written and spoken English and...
4 English and French (oral and written)
Contact email for job application
0 swissal@aiventu.com
1 not specified
2 not specified
3 Not provided
4 not specified
# Drop the posting-metadata and contact columns. Each label is listed once
# and every string literal sits on a single line, so the statement parses.
# data.info() then prints a summary of the columns that remain.
data = data.drop(
    [
        'Application deadline',
        'Job title',
        'Job function',
        'Job location',
        'Job type',
        'Company name',
        'Job recruiter ID',
        'Job website',
        'Contact email for job application',
    ],
    axis=1,
)
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129 entries, 0 to 128
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Industry sector 121 non-null object
1 Required skills 120 non-null object
2 Required education 114 non-null object
3 Required language proficiency 121 non-null object
dtypes: object(4)
memory usage: 4.2+ KB
# Replace missing skill descriptions with an empty string so the column
# holds text in every row.
data['Required skills'] = data['Required skills'].fillna(value='')
import spacy
from spacy.matcher import PhraseMatcher
# Load the English language model for spaCy
nlp = spacy.load('en_core_web_sm')
# Skill vocabularies, grouped by technology area.
# Each entry is a text fragment to look for in a job posting.
# NOTE(review): these entries appear to be combined into a regular
# expression without escaping, so "." in an entry matches any character
# (e.g. "SQL.Server" also matches "SQL Server"); confirm before adding
# entries that contain regex metacharacters.
sql_skills = [
    "sql", "PL.SQL", "mysql", "MS.SQL", "SQL.Server", "SQLite",
    "PostgreSQL", "Rubinius", "T.SQL", "mariadb", "vsql",
]
nosql_skills = [
    "CouchDB", "cassandra", "mongo", "hbase", "redis", "Scylla", "Neo4j",
    "DynamoDB", "Memcached", "Bigtable", "Hypertable", "Oracle", "NoSQL",
]
js_back_fram_skills = [
    "javascript", "Backbone", "Express.js", "kraken.js", "Hapi.js",
    "Koa.js", "TotalJS", "Nest.js", "Sails.Js", "Meteor.Js", "LoopBack",
    "Derby.Js", "Adonis.Js", "Mojito", "Keystone.Js", "Feathers.Js",
    "Restify.Js", "ActionHero.Js", "Sequelize", "Moleculer",
]
js_front_framewrok = [
    "javascript", "angular", "vue.js", "react", "ember", "jquery",
    "Prototype", "ext.js",
]
javascript_skills = [
    "node.js", "javascript", "babel", "npm", "eslint", "typescript",
]
sys_skills = [
    "linux", "unix", "ubuntu", "redhat", "SuSE", "Debian", "fedora", "Bash",
    "nginx", "apache", "systemctl", "bash", "shell", "systemd", "ssh",
    "network",
]
# "C++" must be escaped: as a raw regex fragment "c++" is either rejected
# ("multiple repeat") or, on Python 3.11+, read as a possessive quantifier
# that matches any run of the letter "c".
# NOTE(review): "sCs" looks like a damaged pattern (possibly r"\sC\s");
# left unchanged here - confirm the intended value.
c_skills = [
    "sCs", r"C\+\+", "C#", "wxWidgets", "JUCE", "CEGUI", "CEF", "GTK", "Qt",
]
xml_skills = ["xml", "html"]
css_skills = ["css", "bootstrap", "sass", "postcss"]
rest_skills = ["soa", "soap", "REST", "http", "ajax"]
git_skills = ["git", "svn"]
java_skills = [
    "Java", "JEE", "Spring", "Struts", "Hibernate", "Wicket", "JSF",
    "Dropwizard", "Grails", "ATG", "maven", "spring.boot",
    "spring.security",
]
continus_build_skills = [
    "Buddy", "git", "Jenkins", "TeamCity", "GoCD", "Bamboo", "CircleCI",
    "Codeship", "Buildbot", "Nevercode", "Integrity", "Strider",
    "Autorabit", "Buildkite", "Semaphore", "CruiseControl", "Urbancode",
]
virt_skills = [
    "Docker", "Vagrant", "Wox", "Rancher", "Kubernetes", "Mesos", "LXC",
    "OpenVZ", "kvm",
]
cloud_skills = [
    "Cloud", "Computing", "AWS", "google.cloud", "Bluemix", "OVH", "Joyent",
    "Microsoft.Azure", "Cloudwatt", "Ikoula", "Rackspace", "Nimbus",
    "Niftyname", "OpenStack", "OpenNebula", "Eucalyptus", "vultr",
    "DigitalOcean",
]
php_skills = [
    "php", "symfony", "CodeIgniter", "Agavi", "CakePHP", "Dframe", "Flight",
    "FuelPHP", "Hoa", "Horde", "Jelix", "KumbiaPHP", "Laravel", "Laminas",
    "Mkframework", "MODx", "PEAR", "WebSite.PHP", "Zend", "YAF",
]
# "magento" replaces the misspelt "magneto", which would never match the
# CMS name in a posting.
cms_skills = ["drupal", "wordpress", "joomla", "squarespace", "magento"]
scrum_skills = [
    "Zimbra", "confluence", "Slack", "Scrum", "Wrike", "Agile", "Trello",
    "JIRA", "Assembla", "nTask", "Targetprocess", "Asana", "Clarizen",
    "QuickScrum", "ScrumDo", "VivifyScrum", "scrumban", "Kanban",
    "Waterfall", "GitScrum",
]
testing_skills =
["Testrail","Zephyr","JMeter","TestLink","Selenium","QTP","SoapUI","Tr
icentis.Tosca","Telerik","Katalon
5. robo_skills,elastic_search_skills, r_skills ]
# Flatten every skill group into one de-duplicated, lower-cased set.
all_skills = {
    skill.lower().strip() for group in skill_phrases for skill in group
}
# An empty alternative would match at every position of the text.
all_skills.discard('')
# Entries are joined into the pattern unescaped, so any regex metacharacters
# they contain stay active. Alternatives are tried left to right and the
# first match wins, so longer entries go first ("javascript" before "java");
# the secondary alphabetical key makes the pattern identical on every run.
reg_all_skills = (
    "("
    + "|".join(sorted(all_skills, key=lambda skill: (-len(skill), skill)))
    + ")"
)
import re

# Find every occurrence of a known skill in the 'Required skills' text
# (case-insensitive) and keep the distinct matched strings of each row as a
# set in the new 'extracted_skills' column.
data['extracted_skills'] = (
    data['Required skills']
    .str.findall(reg_all_skills, flags=re.IGNORECASE)
    .apply(set)
)
data.head()
Industry sector
0 Information Technology and Services
1 Information Technology and Services
2 Not specified
3 Banking and finance.
4 Information Technology and Services
Required skills
0 C# or X++, VisualStudio, .NET Framework, SQL S...
1 Java or any other object-oriented language, Li...
2 IOS – Swift development,
3 Java development, Design Patterns, Web service...
4 Design Pattern, Webservices SOAP/REST, Java Me...
Required education
0 Bac+5 in computer science or equivalent
1 Bac+5 in engineering or equivalent
2 Not specified
3 Master's degree in computer engineering or equ...
4 Engineering degree or equivalent (BAC +5 or eq...
Required language proficiency
0 Good level of French and English (spoken and w...
1 Fluent in French and English (both oral and wr...
2 Not specified
3 Good command of written and spoken English and...
4 English and French (oral and written)
extracted_skills
0 {C#, SQL Server}
1 {PostgreSQL, Linux, cloud, KANBAN, SQL Server,...
2 {}
3 {REST, HTML, SOAP, Java, CSS}
4 {REST, Unix, SQL Server, HTML, Oracle, SOAP, J...
from sklearn.feature_extraction.text import CountVectorizer

# Convert each row's set of extracted skills into one space-separated string
skill_strings = data['extracted_skills'].apply(' '.join)

# binary=True records presence/absence (0/1) instead of occurrence counts.
# NOTE(review): with its default settings CountVectorizer lower-cases the
# text and keeps only tokens of two or more word characters, so a skill such
# as "SQL Server" becomes two features and "C#" produces none - confirm this
# is acceptable for the downstream use.
vectorizer = CountVectorizer(binary=True)

# Fit and transform the skill strings into a matrix of one-hot encoded
# features
skill_features = vectorizer.fit_transform(skill_strings)

# Convert the matrix into a pandas DataFrame for easy manipulation
skill_df = pd.DataFrame(
    skill_features.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
skill_df.head(10)
agile angular aws babel bamboo bi bootstrap cloud computing
css
0 0 0 0 0 0 0 0 0 0
0
1 0 0 1 0 0 0 0 1 0
0
2 0 0 0 0 0 0 0 0 0
0
3 0 0 0 0 0 0 0 0 0
1
4 0 0 0 0 0 0 0 0 0
1
5 0 0 0 0 0 0 0 0 0
1
6 0 1 0 0 0 0 0 0 0
1
7 0 0 0 0 0 0 0 0 0
0
8 0 0 0 0 1 0 0 0 0
0
9 0 0 0 0 1 0 0 0 0
0
... symfony tcp typescript ubuntu unix voip warehous
warehouse
0 ... 0 0 0 0 0 0 0
0
1 ... 0 0 0 0 0 0 0
0
2 ... 0 0 0 0 0 0 0
0