How To Scrape LinkedIn Public Company Data – Beginners Guide

Nowadays everyone knows how big the LinkedIn community is. LinkedIn is one of the largest professional social networks in the world, and it holds a wealth of information: industry insights, data on professionals, and job listings.

For most purposes, the most practical way to get that data out of LinkedIn in bulk is web scraping.

Why Scrape LinkedIn Public Data?

There are many reasons someone might want to scrape data out of LinkedIn. Scraped data is useful, for example, when hiring: rather than reviewing profiles one at a time, you can collect candidate data in bulk and then filter for the people who fit the company best.

Scraping makes this far less time-consuming: it automates the search and gathers millions of records into a single file, which makes the task easy.

Another benefit of scraping is automating a job search. Every job site lists thousands of openings across all kinds of fields, which makes manual searching hectic for people looking only within their own specialty. Scraping lets them apply filters and extract all the relevant listings onto a single page.

In this tutorial, we will scrape public company data from LinkedIn using Python.

Prerequisites:

In this tutorial, we will use basic Python programming along with two Python packages: requests and lxml.

But first, you need to install the following:

  1. Python, available here: https://www.python.org/downloads/
  2. Python requests, installation guide here: http://docs.python-requests.org/en/master/user/install/
  3. Python lxml, installation guide here: http://lxml.de/installation.html
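
If Python and pip are already set up, you can usually install both packages in one step (a typical invocation, assuming pip points at the interpreter you will run the script with):

pip install requests lxml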

Once the installation is done, we will write the Python code to extract LinkedIn public data from company pages.

The code below runs only on Python 2, not Python 3: it relies on sys.setdefaultencoding(), which Python 3 removed because its strings are Unicode by default.

import json
import re
import sys

import lxml.html
import requests

# Python 2 only: reload(sys) re-exposes setdefaultencoding(), which the
# interpreter hides at startup and which Python 3 removed entirely.
# (reload is a builtin in Python 2, so no import is needed.)
# 'cp1251' (Windows Cyrillic) is the original author's choice; 'utf-8'
# is the more common option.
reload(sys)
sys.setdefaultencoding('cp1251')
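As an aside (not part of the script): if you ever port this to Python 3, the two lines above are simply dropped, because Python 3 strings are Unicode by default; bytes are instead decoded explicitly wherever they enter the program. A minimal sketch of that style:

# Python 3 sketch (assumption: the rest of the script is ported too).
raw = b'example \xd0\xb4\xd0\xb0\xd0\xbd\xd0\xbd\xd1\x8b\xd0\xb5'  # bytes off the wire
text = raw.decode('utf-8', errors='replace')  # explicit, local decoding
print(text)
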
# Browser-like request headers so the requests look like normal page views.
HEADERS = {'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
           'accept-encoding': 'gzip, deflate, sdch',
           'accept-language': 'en-US,en;q=0.8',
           'upgrade-insecure-requests': '1',
           'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.85 Safari/537.36'}

# Start company_data.json as a JSON array; records are appended one by one.
file = open('company_data.json', 'w')
file.write('[')
file.close()

# Running count of scraped companies.
COUNT = 0

def increment():
    # Bump the global counter of scraped companies.
    global COUNT
    COUNT = COUNT + 1

def fetch_request(url):
    # Retry the request up to three times (mirroring the original nested
    # try/except chain); return '' if every attempt fails.
    for _ in range(3):
        try:
            return requests.get(url, headers=HEADERS)
        except requests.RequestException:
            continue
    return ''

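A quick usage sketch: fetch_request returns a requests.Response on success and an empty string after three failed attempts, so callers should check for a truthy result (and, before parsing, the status code):

# Illustrative only; any public URL behaves the same way.
resp = fetch_request('https://www.linkedin.com/directory/companies/')
if resp and resp.status_code == 200:
    print('fetched %d bytes' % len(resp.content))
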
def parse_company_urls(company_url):
    # Directory pages link either to further category pages or directly to
    # /company/ pages; recurse until a company page is reached.
    if company_url:
        if '/company/' in company_url:
            parse_company_data(company_url)
        else:
            parent_url = company_url
            fetch_company_url = fetch_request(company_url)
            if fetch_company_url:
                sel = lxml.html.fromstring(fetch_company_url.content)
                COMPANIES_XPATH = '//div[@class="section last"]/div/ul/li/a/@href'
                companies_urls = sel.xpath(COMPANIES_XPATH)
                if companies_urls:
                    if '/company/' in companies_urls[0]:
                        print('Parsing From Category ' + parent_url)
                        print('-------------------------------------------------------------------------------------')
                    for company_url in companies_urls:
                        parse_company_urls(company_url)

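To see what that XPath selects without touching LinkedIn, here is a self-contained illustration against a hand-written snippet shaped like a directory listing (the markup below is a simplified stand-in, not LinkedIn's real HTML):

# Standalone illustration of the directory XPath on made-up markup.
snippet = ('<div class="section last"><div><ul>'
           '<li><a href="/company/example-co">Example Co</a></li>'
           '</ul></div></div>')
doc = lxml.html.fromstring(snippet)
print(doc.xpath('//div[@class="section last"]/div/ul/li/a/@href'))
# -> ['/company/example-co']
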
def parse_company_data(company_data_url):
    # Company pages embed their data as JSON inside HTML comments within
    # <code> elements; extract those blobs and parse them.
    if company_data_url:
        fetch_company_data = fetch_request(company_data_url)
        if fetch_company_data and fetch_company_data.status_code == 200:
            try:
                source = fetch_company_data.content.decode('utf-8')
                sel = lxml.html.fromstring(source)
                # Header blob: logo, company name, follower count.
                code_text = sel.get_element_by_id(
                    'stream-promo-top-bar-embed-id-content')
                if len(code_text) > 0:
                    code_text = str(code_text[0])
                    code_text = re.findall(r'<!--(.*)-->', str(code_text))
                    code_text = code_text[0].strip() if code_text else '{}'
                    json_data = json.loads(code_text)
                    if json_data.get('squareLogo', ''):
                        company_pic = 'https://media.licdn.com/mpr/mpr/shrink_200_200' + \
                                      json_data.get('squareLogo', '')
                    elif json_data.get('legacyLogo', ''):
                        company_pic = 'https://media.licdn.com/media' + \
                                      json_data.get('legacyLogo', '')
                    else:
                        company_pic = ''
                    company_name = json_data.get('companyName', '')
                    followers = str(json_data.get('followerCount', ''))
                    # About blob: industry and other descriptive fields.
                    code_text = sel.get_element_by_id(
                        'stream-about-section-embed-id-content')
                if len(code_text) > 0:
                    code_text = str(code_text[0]).encode('utf-8')
                    code_text = re.findall(r'<!--(.*)-->', str(code_text))
                    code_text = code_text[0].strip() if code_text else '{}'
                    json_data = json.loads(code_text)
                    company_industry = json_data.get('industry', '')
                    item = {'company_name': str(company_name.encode('utf-8')),
                            'followers': str(followers),
                            'company_industry': str(company_industry.encode('utf-8')),
                            'logo_url': str(company_pic),
                            'url': str(company_data_url.encode('utf-8')), }
                    increment()
                    print(item)
                    # json.dumps (rather than str) keeps company_data.json valid JSON.
                    file = open('company_data.json', 'a')
                    file.write(json.dumps(item) + ',\n')
                    file.close()
            except Exception:
                pass

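The trick parse_company_data relies on is that LinkedIn company pages embed their data as JSON inside HTML comments, so stripping the comment markers with a regex leaves a string that json.loads can parse. A self-contained illustration with made-up values:

# Made-up sample of the comment-embedded JSON the parser extracts.
sample = '<!--{"companyName": "Example Co", "followerCount": 42}-->'
payload = re.findall(r'<!--(.*)-->', sample)[0]
print(json.loads(payload))  # {'companyName': 'Example Co', 'followerCount': 42}
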
# Entry point: start from LinkedIn's public company directory and walk
# the category pages from there.
fetch_company_dir = fetch_request('https://www.linkedin.com/directory/companies/')
if fetch_company_dir:
    print('Starting Company Url Scraping')
    print('-----------------------------')
    sel = lxml.html.fromstring(fetch_company_dir.content)
    SUB_PAGES_XPATH = '//div[@class="bucket-list-container"]/ol/li/a/@href'
    sub_pages = sel.xpath(SUB_PAGES_XPATH)
    print('Company Category URL list')
    print('--------------------------')
    print(sub_pages)
    if sub_pages:
        for sub_page in sub_pages:
            parse_company_urls(sub_page)
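
One loose end: the script opens company_data.json with '[' and appends ',\n' after every record, but never writes the closing ']'. A small post-run fix-up (a sketch, assuming the crawl finished and wrote at least one record) trims the trailing comma and closes the array:

# Post-run fix-up: turn '[{...},\n{...},\n' into a valid JSON array.
f = open('company_data.json', 'rb+')
f.seek(-2, 2)   # step back over the final ',\n'
f.truncate()
f.write(b'\n]')
f.close()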