Coverage for backend/app/job_email_scraping/job_scrapers/nhs.py: 21%
48 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-03-17 21:34 +0000
1"""NHS job scraper module"""
3import datetime as dt
4import re
6from apify_client import ApifyClient
8from app.config import settings
9from app.job_email_scraping.schemas import Salary, JobInfo, JobResult
12class NhsJobScraper:
13 """Scraper for NHS job listings."""
15 base_url = "https://beta.jobs.nhs.uk/candidate/jobadvert/"
17 def __init__(self, job_ids: str | list[str]) -> None:
18 """Initialize the scraper with headers and delay settings.
19 :param job_ids: The job listing ID(s)"""
21 self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids
22 self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids]
24 def scrape_job(self) -> list[JobResult]:
25 """Scrape job data from a specific NHS job listing URL"""
27 client = ApifyClient(settings.apify_api_key)
29 run_input = {
30 "proxy": {
31 "useApifyProxy": True,
32 "apifyProxyGroups": ["RESIDENTIAL"],
33 },
34 "startUrls": self.job_urls,
35 }
37 actor_id = "memo23/nhs-scraper"
39 run = client.actor(actor_id).call(run_input=run_input)
40 job_data = client.dataset(run["defaultDatasetId"]).list_items().items
41 if not job_data:
42 raise Exception("No job data found.")
44 processed_job_data = []
45 for job in job_data:
47 # Deadline
48 deadline = None
49 is_closed = False
50 if job.get("closingDate", "").upper() == "THIS JOB IS NOW CLOSED":
51 is_closed = True
52 else:
53 try:
54 deadline = dt.datetime.strptime(job.get("closingDate"), "%d %B %Y")
55 except:
56 pass
58 # Salary
59 pattern = r"(?P<currency>£)\s*(?P<min>[\d,]+)\s*to\s*(?P=currency)\s*(?P<max>[\d,]+).*?(?P<frequency>a year|per annum)"
60 match = re.search(pattern, job.get("salary") or "", re.IGNORECASE)
62 min_salary = max_salary = None
63 currency = None
64 if match:
65 frequency = match.group("frequency").lower()
66 if "year" in frequency or "annum" in frequency:
67 currency = match.group("currency")
68 min_salary = int(match.group("min").replace(",", ""))
69 max_salary = int(match.group("max").replace(",", ""))
71 # Description
72 description = [job.get("jobSummaryText"), job.get("mainDutiesText"), job.get("aboutUsText")]
73 description = "\n\n".join([d for d in description if d])
75 # Raise an exception if planned downtime
76 if (
77 job.get("title") == "NHS Jobs: Planned downtime"
78 or job.get("title") == "Sorry, there is a problem with the service"
79 ):
80 raise Exception(job.get("title"))
82 processed_job_data.append(
83 JobResult(
84 company=job.get("employer") or None,
85 location=" ".join(job.get("employerAddress", "")) or None,
86 job=JobInfo(
87 title=job.get("title") or None,
88 description=description or None,
89 deadline=deadline,
90 is_closed=is_closed,
91 salary=Salary(
92 min_amount=min_salary,
93 max_amount=max_salary,
94 currency=currency,
95 ),
96 ),
97 raw=str(job),
98 )
99 )
101 return processed_job_data
if __name__ == "__main__":
    # Ad-hoc manual check: scrape one known NHS listing and dump the result.
    results = NhsJobScraper("M9043-25-0282").scrape_job()
    print(results)