Coverage for backend/app/job_email_scraping/job_scrapers/indeed.py: 46%

39 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""Indeed Job Scrapers""" 

2 

3import re 

4 

5from app.job_email_scraping.job_scrapers.apify import ApifyJobScraper 

6from app.job_email_scraping.job_scrapers.brightdata import BrightdataJobScraper 

7from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

8 

9 

class IndeedBrightdataJobScraper(BrightdataJobScraper):
    """Indeed Scraper using Brightdata"""

    base_url = "https://www.indeed.com/viewjob?jk="
    name = "indeed"
    poll_interval: int | float = 10
    max_attempts: int = 100

    # One GBP amount: digits with optional thousands groups, optional decimals and an
    # optional "k" suffix, e.g. "30,000", "1,250,000", "30.5" or "30k".
    _AMOUNT_PATTERN = r"£(\d+(?:,\d+)*(?:\.\d+)?[kK]?)"
    # Yearly salary range such as "£30,000 - £35,000 a year" or "£30k – £35k per annum".
    # Compiled once at class creation instead of on every call.
    _SALARY_PATTERN = re.compile(_AMOUNT_PATTERN + r"\s*[-–]\s*" + _AMOUNT_PATTERN + r"\s+(?:a|per)\s+(?:year|annum)")
    # Literal text removed from the ends of the description
    # (presumably residue of the page's expand/collapse buttons - TODO confirm)
    _DESCRIPTION_BOILERPLATE = "Show more Show less"

    @staticmethod
    def _parse_amount(amount: str) -> float:
        """Convert a salary amount captured by the salary pattern to a number
        :param amount: Amount text without the currency symbol, e.g. "30,000", "30.5" or "30k"
        :return: Amount as a float, with a trailing "k"/"K" expanded to thousands"""

        amount = amount.replace(",", "")
        # float() cannot parse the "k" shorthand, so expand it explicitly
        if amount[-1] in "kK":
            return float(amount[:-1]) * 1000
        return float(amount)

    @classmethod
    def _clean_description(cls, description: str | None) -> str | None:
        """Remove surrounding whitespace and the boilerplate text from a description
        :param description: Raw description text, may be None
        :return: Cleaned description, or None if nothing is left"""

        text = (description or "").strip()
        # str.strip(chars) treats its argument as a set of characters and would eat
        # legitimate leading/trailing letters, so remove the literal text instead
        text = text.removeprefix(cls._DESCRIPTION_BOILERPLATE).removesuffix(cls._DESCRIPTION_BOILERPLATE)
        return text.strip() or None

    def _process_job_data(self, job_data: dict) -> JobResult:
        """Process job data to extract relevant information
        :param job_data: Job data dictionary
        :return: JobResult containing job information"""

        # Extract the yearly salary; anything that is not a yearly GBP range is left unset
        min_amount = None
        max_amount = None
        currency = None
        salary_range = job_data.get("salary_formatted")
        if salary_range and (match := self._SALARY_PATTERN.search(salary_range)):
            min_amount = self._parse_amount(match.group(1))
            max_amount = self._parse_amount(match.group(2))
            currency = "GBP"

        return JobResult(
            company=job_data.get("company_name"),
            company_id=job_data.get("company_url"),
            location=job_data.get("location"),
            job=JobInfo(
                title=job_data.get("job_title"),
                # The key may be present with a null value, so the helper tolerates None
                description=self._clean_description(job_data.get("description_text")),
                url=job_data.get("url"),
                salary=Salary(
                    min_amount=min_amount,
                    max_amount=max_amount,
                    currency=currency,
                ),
            ),
            raw=str(job_data),
        )

53 

54 

class IndeedApifyJobScraper(ApifyJobScraper):
    """Indeed job scraper backed by an Apify actor"""

    base_url = "https://www.indeed.com/viewjob?jk="
    name = "indeed"
    actor_id = "memo23/apify-indeed-cheerio-ppr"
    poll_interval: int | float = 10
    max_attempts: int = 100

    def _process_job_data(self, job_data: dict) -> JobResult:
        """Build a job result from one item returned by the Apify Indeed actor
        :param job_data: Job data dictionary; must contain the nested "jobInfoModel" structure
        :return: JobResult holding the company, location, title and description
        :raises KeyError: If any of the expected nested keys is missing"""

        # All fields live under "jobInfoModel"; title and company sit in its header sub-model
        info_model = job_data["jobInfoModel"]
        header_model = info_model["jobInfoHeaderModel"]

        job_title = header_model["jobTitle"]
        full_address = info_model["location"]["fullAddress"]
        company_name = header_model["companyName"]
        description_text = info_model["description"]["text"]

        job_info = JobInfo(title=job_title, description=description_text)
        return JobResult(
            company=company_name,
            location=full_address,
            job=job_info,
            raw=str(job_data),
        )

84 

85 

if __name__ == "__main__":
    # Manual example run: scrape the same example with the Brightdata scraper first,
    # then with the Apify scraper, printing each result.
    # The argument is presumably the Indeed job key (the "jk" value of base_url) - TODO confirm
    example_job_id = "758f2768706ab970"
    for scraper_class in (IndeedBrightdataJobScraper, IndeedApifyJobScraper):
        print(scraper_class(example_job_id).scrape_job())