Coverage for backend/app/job_email_scraping/job_scrapers/nhs.py: 21%
48 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""NHS job scraper module"""
3import datetime as dt
4import re
6from apify_client import ApifyClient
8from app.config import settings
9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
12class NhsJobScraper:
13 """Scraper for NHS job listings."""
15 base_url = "https://beta.jobs.nhs.uk/candidate/jobadvert/"
17 def __init__(self, job_ids: str | list[str]) -> None:
18 """Initialize the scraper with headers and delay settings.
19 :param job_ids: The job listing ID(s)"""
21 self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids
22 self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids]
24 def scrape_job(self) -> list[JobResult]:
25 """Scrape job data from a specific NHS job listing URL"""
27 client = ApifyClient(settings.apify_api_key)
29 run_input = {
30 "proxy": {
31 "useApifyProxy": True,
32 "apifyProxyGroups": ["RESIDENTIAL"],
33 },
34 "startUrls": self.job_urls,
35 }
37 actor_id = "memo23/nhs-scraper"
39 run = client.actor(actor_id).call(run_input=run_input)
40 job_data = client.dataset(run["defaultDatasetId"]).list_items().items
41 if not job_data:
42 raise Exception("No job data found.")
44 processed_job_data = []
45 for job in job_data:
47 # Deadline
48 deadline = None
49 is_closed = False
50 if job.get("closingDate", "").upper() == "THIS JOB IS NOW CLOSED":
51 is_closed = True
52 else:
53 try:
54 deadline = dt.datetime.strptime(job.get("closingDate"), "%d %B %Y")
55 except:
56 pass
58 # Salary
59 pattern = r"(?P<currency>£)\s*(?P<min>[\d,]+)\s*to\s*(?P=currency)\s*(?P<max>[\d,]+).*?(?P<frequency>a year|per annum)"
60 match = re.search(pattern, job.get("salary") or "", re.IGNORECASE)
62 min_salary = max_salary = None
63 currency = None
64 if match:
65 frequency = match.group("frequency").lower()
66 if "year" in frequency or "annum" in frequency:
67 currency = match.group("currency")
68 min_salary = int(match.group("min").replace(",", ""))
69 max_salary = int(match.group("max").replace(",", ""))
71 # Description
72 description = [job.get("jobSummaryText"), job.get("mainDutiesText"), job.get("aboutUsText")]
73 description = "\n\n".join([d for d in description if d])
75 # Raise an exception if planned downtime
76 if (
77 job.get("title") == "NHS Jobs: Planned downtime"
78 or job.get("title") == "Sorry, there is a problem with the service"
79 ):
80 raise Exception(job.get("title"))
82 processed_job_data.append(
83 JobResult(
84 company=job.get("employer") or None,
85 location=" ".join(job.get("employerAddress", "")) or None,
86 job=JobInfo(
87 title=job.get("title") or None,
88 description=description or None,
89 deadline=deadline,
90 is_closed=is_closed,
91 salary=Salary(
92 min_amount=min_salary,
93 max_amount=max_salary,
94 currency=currency,
95 ),
96 ),
97 raw=str(job),
98 )
99 )
101 return processed_job_data
if __name__ == "__main__":
    # Ad-hoc manual check: scrape one known NHS listing and dump the result.
    results = NhsJobScraper("M9043-25-0282").scrape_job()
    print(results)