Coverage for backend / app / job_email_scraping / job_scrapers / indeed.py: 46%
39 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""Indeed Job Scrapers"""
3import re
5from app.job_email_scraping.job_scrapers.apify import ApifyJobScraper
6from app.job_email_scraping.job_scrapers.brightdata import BrightdataJobScraper
7from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
class IndeedBrightdataJobScraper(BrightdataJobScraper):
    """Indeed Scraper using Brightdata"""

    base_url = "https://www.indeed.com/viewjob?jk="
    name = "indeed"
    poll_interval: int | float = 10
    max_attempts: int = 100

    # Yearly GBP salary range, e.g. "£30,000 - £40,000 a year" or "£30k – £40k per annum"
    _SALARY_PATTERN = re.compile(
        r"£(\d+(?:,\d+)?(?:k|K)?(?:\.\d+)?)\s*[-–]\s*£(\d+(?:,\d+)?(?:k|K)?(?:\.\d+)?)\s+(?:a|per)\s+(?:year|annum)"
    )
    # Literal text removed from the end of a description when present
    _DESCRIPTION_SUFFIX = "Show more Show less"

    @staticmethod
    def _parse_amount(raw: str) -> float:
        """Convert a matched salary figure to a number
        :param raw: Figure captured by the salary pattern, e.g. "30,000" or "30k"
        :return: Amount as a float, with a "k"/"K" suffix expanded to thousands"""

        cleaned = raw.replace(",", "")
        if "k" not in cleaned.lower():
            return float(cleaned)
        # float() cannot parse the "k" the pattern allows, so drop it and scale instead
        return float(cleaned.replace("k", "").replace("K", "")) * 1000

    def _process_job_data(self, job_data: dict) -> JobResult:
        """Process job data to extract relevant information
        :param job_data: Job data dictionary
        :return: JobResult containing job information"""

        # Extract the yearly salary; anything that is not a GBP yearly range is left unset
        min_amount: float | None = None
        max_amount: float | None = None
        currency: str | None = None
        salary_range = job_data.get("salary_formatted")
        if salary_range and (match := self._SALARY_PATTERN.search(salary_range)):
            min_amount = self._parse_amount(match.group(1))
            max_amount = self._parse_amount(match.group(2))
            currency = "GBP"

        # `or ""` also covers a key that is present but null.
        # str.strip(chars) treats its argument as a character set and would eat real
        # leading/trailing letters, so remove the literal suffix explicitly instead.
        description = (job_data.get("description_text") or "").strip()
        description = description.removesuffix(self._DESCRIPTION_SUFFIX).rstrip()

        return JobResult(
            company=job_data.get("company_name"),
            company_id=job_data.get("company_url"),
            location=job_data.get("location"),
            job=JobInfo(
                title=job_data.get("job_title"),
                description=description or None,
                url=job_data.get("url"),
                salary=Salary(
                    min_amount=min_amount,
                    max_amount=max_amount,
                    currency=currency,
                ),
            ),
            raw=str(job_data),
        )
class IndeedApifyJobScraper(ApifyJobScraper):
    """Indeed Scraper using Apify"""

    base_url = "https://www.indeed.com/viewjob?jk="
    name = "indeed"
    actor_id = "memo23/apify-indeed-cheerio-ppr"
    poll_interval: int | float = 10
    max_attempts: int = 100

    def _process_job_data(self, job_data: dict) -> JobResult:
        """Build a JobResult from the nested payload returned by the Apify Indeed actor
        :param job_data: Job data dictionary from Apify Indeed actor
        :return: JobResult containing job information
        :raises KeyError: If any of the expected nested keys is missing"""

        # Everything of interest lives under "jobInfoModel"; the title and company
        # sit one level deeper in its header sub-model
        info_model = job_data["jobInfoModel"]
        header_model = info_model["jobInfoHeaderModel"]

        job_title = header_model["jobTitle"]
        full_address = info_model["location"]["fullAddress"]
        company_name = header_model["companyName"]
        description_text = info_model["description"]["text"]

        job_info = JobInfo(title=job_title, description=description_text)
        return JobResult(
            company=company_name,
            location=full_address,
            job=job_info,
            raw=str(job_data),
        )
if __name__ == "__main__":
    # Indeed job scraper example: run the same ID through Brightdata first, then Apify
    for scraper_cls in (IndeedBrightdataJobScraper, IndeedApifyJobScraper):
        print(scraper_cls("758f2768706ab970").scrape_job())