Parsing a website for job offers

Given: a website with posted job offers.
Goal: extract the offer information from the website using Python and BeautifulSoup, and save it to JSON and Markdown files.

Python script

Install and import required packages

# pip install requests beautifulsoup4
import json

import requests
from bs4 import BeautifulSoup

Execute request

def execute_requests(base_url, amount_of_pages):
    """
    Executes GET requests for the specified number of pages and returns the responses.

    Args:
        base_url (str): The base URL for requests.
        amount_of_pages (int): The number of pages to fetch.

    Returns:
        list: A list of dictionaries containing the request key, page counter, and response content.
    """
    headers = {
        # Session cookie copied from the browser; it expires and has to be refreshed (see note at the end).
        'Cookie': '_jobboard_session=895b7b35b6493519c3ad686923d8cc1d; __cf_bm=BrUIPeJX6XqIr7jlW.4M-1732742386-1.0.1.1-1hk8BgPr6ZL6QswlF6K2dUhchp0reiDPOXzX6z.etyq.IUHZqPg'
    }

    responses_data = []  # Initialize an empty list to store response data

    for counter in range(1, amount_of_pages + 1):
        # Construct the URL with the current page counter
        url = f"{base_url}&page={counter}"
        print(f"Fetching data from: {url}")

        try:
            # Send GET request
            response = requests.get(url, headers=headers)

            # Append the response data to the list
            responses_data.append({
                "request_key": f"request_{counter}",
                "counter": counter,
                "response_content": response.text
            })
            print(f"counter: {counter} | status_code: {response.status_code}")
        except requests.RequestException as e:
            print(f"Error fetching data for page {counter}: {e}")

    return responses_data
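
Before requesting all pages, it can help to fetch a single page and confirm the response body is non-empty; a quick sanity check using the same URL as the main block below:

# Quick sanity check: fetch one page first to confirm the cookie is still valid.
responses = execute_requests(
    "https://www.ziprecruiter.co.uk/jobs/search?l=Remote&q=qa+software+engineer&remote=full",
    1,
)
# A long HTML body together with status code 200 suggests the request was accepted.
print(len(responses[0]["response_content"]))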

Parse job data from the responses


def parse_job_data_from_json(response_data, output_json_file, output_markdown_file):
    """
    Parse job data from a list of responses and extract job listings using BeautifulSoup.
    Save results to both a JSON file and a Markdown file.

    Args:
        response_data (list): List of dictionaries containing the response data.
        output_json_file (str): Path to save the parsed job data in JSON format.
        output_markdown_file (str): Path to save the parsed job data in Markdown format.
    """
    try:
        job_data = []  # List to store extracted job data
        markdown_content = []  # List to store Markdown entries

        # Loop through each request in the list
        for request in response_data:
            counter = request.get("counter", "unknown")
            response_content = request.get("response_content", "")

            # Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(response_content, 'html.parser')

            # Find all job listings using the locator
            job_listings = soup.find_all('li', class_='job-listing')

            # Extract data from each job listing
            for job in job_listings:
                # Safely find required elements, fall back to 'N/A' if not present
                job_title = job.find('a', class_='jobList-title zip-backfill-link')
                job_description = job.find('div', class_='jobList-description')
                salary = job.find('div', class_='jobList-salary')

                job_info = {
                    'title': job_title.text.strip() if job_title else 'N/A',
                    'href': job_title['href'] if job_title else 'N/A',
                    'description': job_description.text.strip() if job_description else 'N/A',
                    'salary': salary.text.strip() if salary else 'N/A',
                    'page': counter
                }
                job_data.append(job_info)

                # Prepare entry for Markdown
                markdown_entry = f"""
### Job Title: {job_info['title']}
- **Link**: [{job_info['title']}]({job_info['href']})
- **Description**: {job_info['description']}
- **Salary**: {job_info['salary']}
- **Page**: {job_info['page']}
"""
                markdown_content.append(markdown_entry.strip())

        # Save extracted job data to a new JSON file
        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(job_data, f, ensure_ascii=False, indent=4)

        print(f"Job data successfully parsed and saved to {output_json_file}")

        # Save Markdown content to a file
        with open(output_markdown_file, 'w', encoding='utf-8') as f:
            f.write("\n\n".join(markdown_content))

        print(f"Job data successfully saved to {output_markdown_file}")

    except Exception as e:
        print(f"Error processing file: {e}")

IMPORTANT!

  • Provide valid file paths for the output JSON and Markdown files.
  • Make sure you copy a valid URL from the browser and handle pagination properly (see the sketch below).
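
Note that the f-string f"{base_url}&page={counter}" in execute_requests only works because the URL copied from the browser already contains a query string. A minimal sketch of a more robust approach, using only the standard library:

from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

def with_page(base_url, page):
    """Return base_url with the 'page' query parameter set, preserving existing parameters."""
    parts = urlparse(base_url)
    query = parse_qs(parts.query)
    query["page"] = [str(page)]
    return urlunparse(parts._replace(query=urlencode(query, doseq=True)))

# Works whether or not the base URL already has query parameters.
print(with_page("https://www.ziprecruiter.co.uk/jobs/search?l=Remote&q=qa+software+engineer&remote=full", 2))

Main script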
if __name__ == '__main__':
    base_url = "https://www.ziprecruiter.co.uk/jobs/search?l=Remote&q=qa+software+engineer&remote=full"
    amount_of_pages = 100  # Or any number of pages that you wish to check
    responses_data = execute_requests(base_url, amount_of_pages)

    # Output files for parsed data
    output_json_file = 'parsed_job_data.json'
    output_markdown_file = 'parsed_job_data.md'

    # Parse and save the job data
    parse_job_data_from_json(responses_data, output_json_file, output_markdown_file)

The script works fine for several executions. After that the session cookie expires and a new one has to be copied from the browser.
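
To avoid editing the script each time the cookie expires, one option is to read it from an environment variable; a minimal sketch, assuming a variable name of JOBBOARD_COOKIE (an arbitrary choice):

import os

# Hypothetical environment variable holding the cookie string copied from the browser.
cookie = os.environ.get("JOBBOARD_COOKIE")
if not cookie:
    raise SystemExit("Set JOBBOARD_COOKIE to the cookie string copied from the browser.")

headers = {"Cookie": cookie}

With this in place, refreshing the cookie is a matter of re-exporting the variable rather than changing the source.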