Coverage for backend / app / job_email_scraping / job_scrapers / nhs.py: 21%

48 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""NHS job scraper module""" 

2 

3import datetime as dt 

4import re 

5 

6from apify_client import ApifyClient 

7 

8from app.config import settings 

9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

10 

11 

12class NhsJobScraper: 

13 """Scraper for NHS job listings.""" 

14 

15 base_url = "https://beta.jobs.nhs.uk/candidate/jobadvert/" 

16 

17 def __init__(self, job_ids: str | list[str]) -> None: 

18 """Initialize the scraper with headers and delay settings. 

19 :param job_ids: The job listing ID(s)""" 

20 

21 self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids 

22 self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids] 

23 

24 def scrape_job(self) -> list[JobResult]: 

25 """Scrape job data from a specific NHS job listing URL""" 

26 

27 client = ApifyClient(settings.apify_api_key) 

28 

29 run_input = { 

30 "proxy": { 

31 "useApifyProxy": True, 

32 "apifyProxyGroups": ["RESIDENTIAL"], 

33 }, 

34 "startUrls": self.job_urls, 

35 } 

36 

37 actor_id = "memo23/nhs-scraper" 

38 

39 run = client.actor(actor_id).call(run_input=run_input) 

40 job_data = client.dataset(run["defaultDatasetId"]).list_items().items 

41 if not job_data: 

42 raise Exception("No job data found.") 

43 

44 processed_job_data = [] 

45 for job in job_data: 

46 

47 # Deadline 

48 deadline = None 

49 is_closed = False 

50 if job.get("closingDate", "").upper() == "THIS JOB IS NOW CLOSED": 

51 is_closed = True 

52 else: 

53 try: 

54 deadline = dt.datetime.strptime(job.get("closingDate"), "%d %B %Y") 

55 except: 

56 pass 

57 

58 # Salary 

59 pattern = r"(?P<currency>£)\s*(?P<min>[\d,]+)\s*to\s*(?P=currency)\s*(?P<max>[\d,]+).*?(?P<frequency>a year|per annum)" 

60 match = re.search(pattern, job.get("salary") or "", re.IGNORECASE) 

61 

62 min_salary = max_salary = None 

63 currency = None 

64 if match: 

65 frequency = match.group("frequency").lower() 

66 if "year" in frequency or "annum" in frequency: 

67 currency = match.group("currency") 

68 min_salary = int(match.group("min").replace(",", "")) 

69 max_salary = int(match.group("max").replace(",", "")) 

70 

71 # Description 

72 description = [job.get("jobSummaryText"), job.get("mainDutiesText"), job.get("aboutUsText")] 

73 description = "\n\n".join([d for d in description if d]) 

74 

75 # Raise an exception if planned downtime 

76 if ( 

77 job.get("title") == "NHS Jobs: Planned downtime" 

78 or job.get("title") == "Sorry, there is a problem with the service" 

79 ): 

80 raise Exception(job.get("title")) 

81 

82 processed_job_data.append( 

83 JobResult( 

84 company=job.get("employer") or None, 

85 location=" ".join(job.get("employerAddress", "")) or None, 

86 job=JobInfo( 

87 title=job.get("title") or None, 

88 description=description or None, 

89 deadline=deadline, 

90 is_closed=is_closed, 

91 salary=Salary( 

92 min_amount=min_salary, 

93 max_amount=max_salary, 

94 currency=currency, 

95 ), 

96 ), 

97 raw=str(job), 

98 ) 

99 ) 

100 

101 return processed_job_data 

102 

103 

if __name__ == "__main__":
    # Manual smoke test: scrape one known advert and dump the result.
    results = NhsJobScraper("M9043-25-0282").scrape_job()
    print(results)