Coverage for backend / app / job_email_scraping / job_scrapers / veganjobs.py: 22%

46 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""veganjobs.com job scraper module""" 

2 

3import re 

4 

5import cloudscraper 

6from bs4 import BeautifulSoup 

7 

8from app.job_email_scraping.schemas import Salary, JobInfo, JobResult 

9 

10 

11class VeganJobsJobScraper: 

12 """Scraper for veganjobs.com job listings.""" 

13 

14 base_url = "https://veganjobs.com/job/" 

15 

16 def __init__(self, job_ids: str | list[str]) -> None: 

17 """Initialize the scraper with headers and delay settings. 

18 :param job_ids: The job ID(s)""" 

19 

20 self.scraper = cloudscraper.create_scraper() 

21 self.job_ids = [job_ids] if isinstance(job_ids, str) else job_ids 

22 self.job_urls = [f"{self.base_url}{job_id}" for job_id in self.job_ids] 

23 

24 def scrape_job_listing(self, job_url: str) -> JobResult: 

25 """Scrape job data from a specific veganjobs.com job listing URL 

26 :param job_url: The URL of the job listing to scrape""" 

27 

28 response = self.scraper.get(job_url) 

29 response.raise_for_status() 

30 

31 soup = BeautifulSoup(response.content, "html.parser") 

32 

33 # Defaults 

34 company = None 

35 title = None 

36 location = None 

37 

38 # Title 

39 title_tag = soup.find("h2", class_="page-title") 

40 if title_tag: 

41 title = title_tag.get_text(strip=True) 

42 

43 # Company 

44 company_tag = soup.find("div", class_="joblisting-meta-company-name") 

45 if company_tag: 

46 company = company_tag.get_text(strip=True) 

47 

48 # Location 

49 location_tag = soup.find("li", class_="location") 

50 if location_tag: 

51 location = location_tag.get_text(strip=True) 

52 

53 # Full text block 

54 container = soup.find("div", class_="job_listing-description") 

55 # noinspection PyArgumentList 

56 text_content = container.get_text(separator="\n", strip=True) if container else "" 

57 

58 # Salary 

59 # salary_match = re.search(r"Salary:\s*(.+)", text_content) 

60 # salary_raw = salary_match.group(1).split("\n")[0] if salary_match else None 

61 

62 # Description (remove salary) 

63 description = re.sub(r"Salary:.*", "", text_content, flags=re.DOTALL).strip() 

64 description = description.strip("Overwiew").strip() 

65 

66 return JobResult( 

67 company=company, 

68 company_id=None, 

69 location=location, 

70 job=JobInfo( 

71 title=title, 

72 description=description, 

73 salary=Salary( 

74 min_amount=None, 

75 max_amount=None, 

76 currency=None, 

77 ), 

78 ), 

79 raw=soup.text, 

80 ) 

81 

82 def scrape_job(self) -> list[JobResult]: 

83 """Scrape a single job listing from the given URL.""" 

84 

85 job_data = [] 

86 for job_url in self.job_urls: 

87 for i in range(50): 

88 try: 

89 job_data.append(self.scrape_job_listing(job_url)) 

90 break 

91 except: 

92 pass 

93 else: 

94 raise AssertionError("Failed to scrape job listing after multiple attempts.") 

95 return job_data 

96 

97 

if __name__ == "__main__":
    # Manual smoke test: scrape one known listing and dump the result.
    job_slug = "sharpen-strategy-remote-usa-operations-coordinator"
    results = VeganJobsJobScraper(job_slug).scrape_job()
    print(results)