Coverage for backend/tests/eis/test_email_scraper.py: 99% (460 statements)
coverage.py v7.10.7, created at 2025-09-22 15:38 +0000
1"""Test module for email_parser.py functions and GmailScraper class"""
3import datetime
4from unittest.mock import MagicMock, patch
6import pytest
8from app.eis import schemas
9from app.eis.email_scraper import clean_email_address, get_user_id_from_email, GmailScraper
10from app.eis.job_scraper import extract_indeed_jobs_from_email
11from app.eis.models import JobAlertEmail, ScrapedJob
12from tests.conftest import open_file
13from tests.eis.test_job_scraper import MockLinkedinJobScraper, MockIndeedJobScraper
16# ------------------------------------------------------ FIXTURES ------------------------------------------------------
def create_gmail_scraper(**kwargs) -> GmailScraper:
    """Create a GmailScraper instance for testing with all file/network dependencies mocked.

    Patches file reads (secrets/token), pickle (de)serialisation, the Gmail service
    builder, and the OAuth flow so that constructing GmailScraper performs no I/O.

    :param kwargs: keyword arguments passed through to the GmailScraper constructor
    :return: a GmailScraper wired to a MagicMock Gmail service"""

    with (
        patch("builtins.open", create=True),
        patch("json.load") as mock_json_load,
        patch("os.path.exists") as mock_exists,
        patch("pickle.load"),
        patch("pickle.dump"),
        patch("app.eis.email_scraper.build") as mock_build,
    ):
        # Mock the secrets file reading — shape mirrors a real Google OAuth
        # "installed app" client-config JSON, nested under the "google_auth" key.
        mock_json_load.return_value = {
            "google_auth": {
                "installed": {
                    "client_id": "test_client_id.apps.googleusercontent.com",
                    "project_id": "test-project",
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                    "client_secret": "test_client_secret",
                    "redirect_uris": ["http://localhost"],
                }
            }
        }

        # Mock token file doesn't exist (forces the fresh-authentication path)
        mock_exists.return_value = False

        # Mock Gmail service returned by googleapiclient's build()
        mock_service = MagicMock()
        mock_build.return_value = mock_service

        # Mock the OAuth flow so run_local_server() yields valid credentials
        # without opening a browser or binding a local port.
        with patch("google_auth_oauthlib.flow.InstalledAppFlow.from_client_config") as mock_flow:
            mock_credentials = MagicMock()
            mock_credentials.valid = True
            mock_flow_instance = MagicMock()
            mock_flow_instance.run_local_server.return_value = mock_credentials
            mock_flow.return_value = mock_flow_instance

            # Create scraper with mocked dependencies; the filenames are never read.
            scraper = GmailScraper(secrets_file="test_secrets.json", token_file="test_token.json", **kwargs)

    return scraper
def mock_get_indeed_redirected_url(url: str) -> str:
    """Mock replacement for get_indeed_redirected_url: map a known Indeed
    ``pagead`` tracking URL to its canonical ``rc/clk/dl?jk=<job_key>`` URL
    without performing any HTTP request.

    :param url: the full Indeed ad-redirect URL as it appears in the email body
    :return: the direct Indeed job URL for the corresponding job key
    :raises KeyError: if ``url`` is not one of the known fixture URLs"""

    # Lookup table: opaque pagead redirect URL (verbatim from the fixture email)
    # -> the 16-hex-char Indeed job key it resolves to. Keys must stay byte-exact.
    conv = {
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0CaUNNDciQjB8b911OChydWlMiE438Jot_lydiWr9Z7lbj9cwyJAEEXhSuW8SoD7Wz1bcqpb5rq8IzPxIcuirUCwOlLSL9SL1F572G6Ye9pXIlV00tsAM20VfzF1b86kTFEpwUl5cqoBjsMlRudbS30FMebfIGC01chUG_dRw15uQJAniZZ9m2OwXKNijACF8VWjBKulQ_zZI6qbz8kD41WGqtaC6lMPRCw5kXUrJbTDCaqSpugfThHENgjlu3j5DBWMjvzWpApXtcxY1NTDKT2jg6q-Z5ZkxpZFWJpPicGjeEfETjD8De3kM__AclzfTjESmozVOJMXW85h3mgPZ94GIuFEx8ppqwDwLENrDoalprKNGMFQOeZ9u9dMbxUX_RJCqW9z1vgoP6UivsqTanzYlukGXOhEQ6IFVnNvDODivSUcZCpO_yBMmxlJxaYuRjPQmnuvS8CFyF8B-M_msQscB4GMRxaiGJuzie7_iJr6nKUP2O7lo1n69wInEp_MnehsLtxzcDysc6eBzfF4v2KkuXm1RRPbFqeIA7TK2sPoy2Z8b3VGKVcWv8k90XwuftkqxlnbbXeP3t1ygWiIMHdoJNVKkxUu46MZXtM498k9txG9p9ByQhDcOI8_BRoVsP3DM1wQl1ang-WkAVoo2PTwmdtETp3VlZZuUfSGtYYEdj-E9JmOVulmnyjbLfssmM%3D&xkcb=SoB56_M3u5Oxdj0MCJ0ObzkdCdPP&camk=UoKtGZLa3XLCRNJifgWECQ%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "06498cad9de95b12",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0D_vIW1HWJamhhVblwSY9vEnB3YehQDBaLQWgEpQbAFvEB66TXnGDud1dy-8adNNEA8NkJwfd77g5zBB1ZOXhf8PEjWP1V1-Zs6swoSDNPKB4lvVzHxu1T3qM7FYs12eUEkiIA-iiINRZ_P2VMyvYooQezlTWytMkd2UWxnVCG9a3_m1cyaMA7DTm_syy5wCWCpCUUvgVdIOEOARvgAhUnIIz9x2Chk3LMqtby4HJFP4Jl7C-Vi5YB8H0bSA1FeugROif2FHIwU9gEobz-VsFvEz_Z4cCH3oft61BFqWCWU_wWimKzWAcDGINsjLw9tAunN_xjEdupF33Iwcd77c1urVC1OLKbL3-o2oJRyEPfNL1YN7H5cP_VieI3Fir6psGrVHQv_bNy0yYleEmT0E_DofaYunYAnzMqD_SUhvCDHia8MqrGJkTcgJp16KsMZPr5_mVLck5-3PYB-3khV71Oqfoa7q1yRWl-SN-Qfwc2OdZ8zl9PsK42-6iQ34faa2uibd37I4QFVw_Rwx7r8W-xyXpiwfe4xmkhhRGK1DeiQibftk7Dyp41hCpPZbTW_bL5F98fT1mfh1u1enhw3sXxk_BjcAXS_HZpuWi5zMuwbIztF4a8ZtEo_fNdlevRIwrN-0-0qjuEDoJYSxnY3mvd2WkDit7XyYAQWaCBCtSOLVSvgSDi4pd033dZ1KPZD7a0uFkrEyWaWSQ%3D%3D&xkcb=SoBQ6_M3u5Oxdj0MCJ0MbzkdCdPP&camk=ethIe0s0hedS-FZyNnahJA%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "42b107e214095d56",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0Cf-siO93BSuJ_a-mQFMzVvPBmFGGJg8IeoYoU7n3Hr-wyttwxtthbeGbpHFYWwmmWPWQtznc_slvzvpsaBmSWUWC64QSSNhEuwuNUWHSLtah1bwBpWniJ8vAR5oqbmqlY296quUSNSViPhje6fSFgDWLhGJWLOZaQ6OJRAp-V8a91no5GJKrUzj_KWnmJKR4rz_W6vZS8NYU5v9qDqx0uOlGmg1BnkC5lIZzyqlYwwOiZdPPVaEKKEr_G0GeQvlH67sGm1xTNyJw8sK6-4jN_ENAf2kd7JTexBVkGw5Mo02tAYXFvdA29R0CGRR0lyQRZtFJjgkhZvLHHLYO8JNjy_mia4G2BQ7Sx4ktyjaStia3kR4-BQNNWnr3k3ocyacfQEMHQlqE-Boaf4mwI0-BtJXesJsw9bvP207NBnfZFLJs1hUmSgvHhdYukY2qIsWXJLUVJgOyjwxdLhap0eFBEyti7g0G0mb3e1eO9ATdBP_e0h_p932Dm6wVyAZEXOddagVLoHFiJWPYnq8BUyKvm_S3vp9I57lYRrxWVTKZve2VIP18Uex6Bz0SozYOEEdgfyqQMBRAcp935Hg8aUW8GrXb3Q-js8GxuFke_S_tiEhCyNOEMjhQ-VRl5QOPdFttLD6e9-WR_H8IFLZUu3KwcfMBy1qEq1Tio%3D&xkcb=SoAk6_M3u5Oxdj0MCJ0AbzkdCdPP&camk=ethIe0s0hedep5fbP4CFtg%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "14a9001ba6ebb965",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0COSBp8KgMXxewvi58QAG0wwdlVlJfveGrD5vFIguWoXakoblclqS-4T_znVTPKawHOSHZOxsl_jK0JZuGPspNA9roT-uonvDv2P6RZVLNvLfm0KdPGmVMWwoNgo5H64KiIVwOuf_UrhuMQzHBJIgwJkroSRqxeEQ_3FKwvys8bTaQ85PMumf55yR90-LeyTGL3GXnHmXVXSfC1MDn6qf5BpprmfFM-RGc2WNblsNn6hNEtF-n7NfrAi-f-PzOE_Fjwhx-Y50MEMdlex_3U6MgwFpw7CADiD1Fch2HOI_bhNgCdt6qoLUO2qEA1AX1Ax0_pwn33z2XS_4FOGRcb4ZGqTii1rx-Elj6c6n-95wiR2sks-xrI0uMrPaE2w8P5k5v6tx1ixIQT9liqyzcXoSS6vzmARulIHV4NUWn0e_K4EvX-A-zYBjcEGSGUrLelauCc21fXrDww_gNV_ZSmedh1M06WDaPc3K_6WYtv6-_kkYQhQJyLlyW0Ws23VNL5nfJygGuW8pXeZhbniMlcDaavPtyGoDp4EWGOAI45uMzcbnJ0UyZcRPmuQxfCD8cFz-lmNle1TxlSWFB7j5QOAIn1UbXcKS7gdbhBijiUJWdSdzfbaPNHZdIPMBs6CDUZT5dPrhj_mtNopw4DVvv-OUOAzOpx9mlyJpr5aE7ivabt7_V3CMtJpw7ieYZ4UBA5ZQQ%3D&xkcb=SoCq6_M3u5Oxdj0MCJ0HbzkdCdPP&camk=UoKtGZLa3XL6dp7SxnkD1A%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "eafb032fabcd77bc",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0DUGxYnv6px9uI6dWZhSaSeqMgHWZda7534TRDDAqMKu87sK88i_2Gbq8z1VBS-lbE9HOACaDVAT4jwhaVY_xabO_rq24Y_veJqW-7_usP-_0tRugSmofb5DuxCq5IvmHBw1rNykLW3A5edDY3v_jFGsNtRR7fiXWfgXBO9BJc6FCnwMo2I8cy9hPyydcFqH8iy9UHGKCJzlwGZAiKzNQyLn0rE_XB9MXJX9itgkAFNjlDq17qpEbAnLeIOJCcDXQ03H-DIxBN3ycBF9r29kZ45spvjQItrgoMklzXH3jPwU2j7qTpqQxKVcw5xKYuIWDhM5YqzbSTzr7Z97yKVWDKaB7gM87UyTYdJ32cflCxws1brYrULvaC8SfbTlTbsHvAdrl7BHnq6r6j_pBdFDKWUW-HcBCMgYk3ikg7sr5qwJAmQMqMjyLYUfWLVQ2ouX79v1awn5CT_sz7DqSikuv7MUgfzGrvbjHnov-zAxQfFPwdSmWZkgIz7UdZVOXCV0M6bw-XkaWtkDrGyiJRLOmEPNiiNwLnsKek3SWBSR8qHNbsrDWHz391rS2onjNWfo5gnmims0O-R-8jgV2J2NQyYP0ZNTYquIehRay6WTLbEZRsxgCy4Pgz42H-Z71EnOTwqnZ-8qLPoJRHV0K9oMQL6&xkcb=SoC36_M3u5Oxdj0MCJ0ebzkdCdPP&camk=ethIe0s0hefv8CfXU2K9Rw%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "5aa22054e7a8b76e",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0BqgWWSVbq3rqstnfUzC8xqhdOuKqZ9Avj77mYlc-g-lgy-1FSdO6PyFnAuQRYfp-JTSxMGeZR4wFhLR1UE4XYsePMvv1exKBMkCeCy9Dh-JYDgYqQLDREEwr5Bfy7uoO_og4WXgkp9rnXdiC6ej8lfOCDGtLs0xpRssH8ApFDX2WPI2WZLU3Dr_bYyzL-F51cHyx5ndFwTEKvG8FqgvbkNe1y7DDUUNUQ1EIdLP4bXw1hDuYRjJm9fbGQDc8LmmrzvdE37KxUZqeU3mzGz2moMrdAZPMufhp93UnQ8QmfOD8uq1LGUenfAtLXc7JvOdVmgZkFtGBtdlJ2Dce9Ty8I9XNaZR1vVTXVwfiM9K6yVwKEH5xhUCsr8a3DFXmcVOrivfiMWlzjRM8Bhtnwff6uJ8CLpNr-VdvfAHJTrsflPiwb6FZFX9sKw1kbd-zDyBDq_vEXiJor5MJKcuzQZ2DH62Tgv_dZllHjmGCWfk5775BFywNThFfEpBqM_-8GhAUHBfb6TSXITGIOiwWH6s7fbs7Fhz8wv20YInHAp2vJ--cjK9uVra5jKMPXk8XB1cUTG-ZWtKfzOtVi4TkT5lfFWC12tyMHgv72MFU3YxnXQZrswfP6D5JhZUJM5toctt1AkDeniJsTqR1-JtOeuQaLjQe7KvUV9qJ_ZUXba6qtMvOfz-BCYBDjc&xkcb=SoAq6_M3u5Oxdj0MCJ0dbzkdCdPP&camk=UoKtGZLa3XJTEZOPwEn50w%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "ae47862d410bbd39",
    }

    return "https://uk.indeed.com/rc/clk/dl?jk=" + conv[url]
@pytest.fixture(autouse=True)
def patch_get_indeed_redirected_url(monkeypatch) -> None:
    """Automatically patch GmailScraper.get_indeed_redirected_url in all tests
    to avoid real HTTP requests when resolving Indeed redirect links.

    :param monkeypatch: pytest's built-in monkeypatch fixture"""

    # GmailScraper is already imported at module level; the redundant
    # function-local re-import was removed.
    monkeypatch.setattr(GmailScraper, "get_indeed_redirected_url", mock_get_indeed_redirected_url)
@pytest.fixture
def gmail_scraper() -> GmailScraper:
    """Provide a fully mocked GmailScraper with BrightAPI Indeed scraping enabled."""

    scraper = create_gmail_scraper(skip_indeed_brightapi_scraping=False)
    return scraper
@pytest.fixture
def gmail_scraper_with_brightapi_skip() -> GmailScraper:
    """Provide a fully mocked GmailScraper configured to skip BrightAPI Indeed scraping."""

    scraper = create_gmail_scraper(skip_indeed_brightapi_scraping=True)
    return scraper
def create_email_data(
    test_users,
    filename: str,
    platform: str,
    user_index: int,
) -> schemas.JobAlertEmailCreate:
    """Create a JobAlertEmailCreate payload for testing.

    :param test_users: test users (provides the sender email address)
    :param filename: base name of the email-body fixture file (without ``.txt``)
    :param platform: platform name (e.g. ``"linkedin"`` or ``"indeed"``)
    :param user_index: index of the owning user in ``test_users``"""

    # Bug fix: the `filename` parameter was previously ignored (a garbled
    # hard-coded placeholder was used for both the fixture file and the
    # external email id, making ids collide across fixtures).
    body = open_file(f"{filename}.txt")
    return schemas.JobAlertEmailCreate(
        # Unique per fixture file / platform / user combination
        external_email_id=f"{filename}_{platform}_{user_index}",
        subject="Subject",
        sender=test_users[user_index].email,
        date_received=datetime.datetime.now(),
        platform=platform,
        body=body,
    )
# Job ids expected to be extracted from the LinkedIn email-body fixture
# (tests/fixture file "linkedin_email.txt"); order matters for equality asserts.
LINKEDIN_JOB_IDS = [
    "4289870503",
    "4291891707",
    "4291383265",
    "4280354992",
    "4255584864",
    "4265877117",
]

# Job ids expected to be extracted from the Indeed email-body fixture
# (tests/fixture file "indeed_email.txt"); order matters for equality asserts.
INDEED_JOB_IDS = [
    "8799a57d87058103",
    "d489097ca0fb185f",
    "7f9c701ebf265b69",
    "0537336f99ba1650",
    "312725e138947a4b",
    "06498cad9de95b12",
    "bd60005166216639",
    "42b107e214095d56",
    "d30493c008b601e3",
    "da413431a0c55ec7",
    "2ed37852402643ab",
    "14a9001ba6ebb965",
    "eafb032fabcd77bc",
    "6838e604ddffd5ac",
    "227d4ccd0823fc96",
    "804b940d2d96b30b",
    "f9aafc9ba4c31c6d",
    "e034f0b761e410ea",
    "37cdb0ba59e12295",
    "7b272f46e4e46a14",
    "d6110bfb54bdeddb",
    "5aa22054e7a8b76e",
    "ae47862d410bbd39",
]
@pytest.fixture
def linkedin_email_data(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Provide a LinkedIn job-alert email payload for user 0 plus its expected job ids."""

    payload = create_email_data(test_users, "linkedin_email", "linkedin", 0)
    return payload, LINKEDIN_JOB_IDS
@pytest.fixture
def linkedin_email_data_user2(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Provide a LinkedIn job-alert email payload for user 1 plus its expected job ids."""

    payload = create_email_data(test_users, "linkedin_email", "linkedin", 1)
    return payload, LINKEDIN_JOB_IDS
@pytest.fixture
def indeed_email_data(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Provide an Indeed job-alert email payload for user 0 plus its expected job ids."""

    payload = create_email_data(test_users, "indeed_email", "indeed", 0)
    return payload, INDEED_JOB_IDS
@pytest.fixture
def indeed_email_data_user2(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Create an Indeed job alert email payload for the second test user.

    Note: the unused ``session`` parameter was removed for consistency with
    the other email-data fixtures, which build payloads without touching the
    database."""

    return create_email_data(test_users, "indeed_email", "indeed", 1), INDEED_JOB_IDS
def create_email_record(session, test_users, filename: str, platform: str, user_index: int) -> JobAlertEmail:
    """Create and persist a JobAlertEmail record for testing.

    (Previous docstring incorrectly said "ScrapedJob" — this builds the email
    payload via create_email_data and commits it as a JobAlertEmail row.)
    :param session: database session
    :param test_users: test users
    :param filename: file name (base name of the email-body fixture)
    :param platform: platform name
    :param user_index: user index (owner of the email)"""

    email_data = create_email_data(test_users, filename, platform, user_index)
    # noinspection PyArgumentList
    email_record = JobAlertEmail(**email_data.model_dump(), owner_id=test_users[user_index].id)
    session.add(email_record)
    session.commit()
    return email_record
@pytest.fixture
def linkedin_email_record(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create and persist a LinkedIn job alert email record for user 0,
    returned together with the job ids expected from its body."""

    return create_email_record(session, test_users, "linkedin_email", "linkedin", 0), LINKEDIN_JOB_IDS
@pytest.fixture
def linkedin_email_record_user2(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create and persist a LinkedIn job alert email record for user 1,
    returned together with the job ids expected from its body.
    (Annotation corrected: the fixture returns a persisted JobAlertEmail,
    not a JobAlertEmailCreate schema.)"""

    return create_email_record(session, test_users, "linkedin_email", "linkedin", 1), LINKEDIN_JOB_IDS
@pytest.fixture
def indeed_email_record(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create and persist an Indeed job alert email record for user 0,
    returned together with the job ids expected from its body.
    (Annotation corrected: the fixture returns a persisted JobAlertEmail,
    not a JobAlertEmailCreate schema.)"""

    return create_email_record(session, test_users, "indeed_email", "indeed", 0), INDEED_JOB_IDS
@pytest.fixture
def indeed_email_record_user2(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create and persist an Indeed job alert email record for user 1,
    returned together with the job ids expected from its body.
    (Annotation corrected: the fixture returns a persisted JobAlertEmail,
    not a JobAlertEmailCreate schema.)"""

    return create_email_record(session, test_users, "indeed_email", "indeed", 1), INDEED_JOB_IDS
239# --------------------------------------------------- BASE FUNCTIONS ---------------------------------------------------
class TestCleanEmailAddress:
    """Tests for the clean_email_address helper."""

    @pytest.mark.parametrize(
        "sender_field,expected",
        [
            ("John Doe <john.doe@gmail.com>", "john.doe@gmail.com"),
            ("john.doe@gmail.com", "john.doe@gmail.com"),
            ('"John Doe" <john.doe@gmail.com>', "john.doe@gmail.com"),
            ("Test User <TEST.USER@EXAMPLE.COM>", "test.user@example.com"),
            (" test@example.com ", "test@example.com"),
            ("Invalid Format", "invalid"),
            ("Jane Smith <jane.smith+tag@company.co.uk>", "jane.smith+tag@company.co.uk"),
            ("Multiple Words Name <multi.word@domain.org>", "multi.word@domain.org"),
        ],
    )
    def test_clean_email_address(self, sender_field, expected) -> None:
        """Each sender-field format is normalised to a bare lowercase address."""

        assert clean_email_address(sender_field) == expected
class TestGetUserIdFromEmail:
    """Test class for get_user_id_from_email function"""

    def test_get_user_id_existing_user(self, session, test_users) -> None:
        """Test getting user ID for an existing user returns that user's id"""

        test_user = test_users[0]
        result = get_user_id_from_email(test_user.email, session)
        assert result == test_user.id

    def test_get_user_id_non_existing_user(self, session) -> None:
        """Test that looking up a non-existing user raises AssertionError
        (previous docstring wrongly claimed a default ID of 1 is returned)"""

        with pytest.raises(AssertionError):
            get_user_id_from_email("nonexistent@example.com", session)

    def test_get_user_id_empty_email(self, session) -> None:
        """Test that an empty email raises AssertionError"""

        with pytest.raises(AssertionError):
            get_user_id_from_email("", session)

    def test_get_user_id_case_sensitivity(self, session, test_users) -> None:
        """Test that email lookup is case-sensitive (as per database collation):
        the upper-cased address does not match and so raises AssertionError"""

        test_user = test_users[0]
        upper_email = test_user.email.upper()
        with pytest.raises(AssertionError):
            get_user_id_from_email(upper_email, session)
296# --------------------------------------------- GMAILSCRAPER STATIC METHODS --------------------------------------------
class TestSaveEmailToDb:
    """Test class for GmailScraper.save_email_to_db method"""

    def test_save_new_email_success(self, linkedin_email_data, test_service_logs, session, test_users) -> None:
        """Test saving a new email successfully: a row is created and every
        field of the payload is persisted"""

        # Fixture yields (payload, expected_job_ids); only the payload is needed here
        linkedin_email_data = linkedin_email_data[0]
        result_email, is_created = GmailScraper.save_email_to_db(linkedin_email_data, test_service_logs[0].id, session)

        assert is_created is True
        assert result_email.external_email_id == linkedin_email_data.external_email_id
        assert result_email.subject == linkedin_email_data.subject
        assert result_email.sender == linkedin_email_data.sender
        assert result_email.platform == linkedin_email_data.platform
        assert result_email.body == linkedin_email_data.body
        # Owner is resolved from the sender address (payload sender is test_users[0])
        assert result_email.owner_id == test_users[0].id
        assert result_email.service_log_id == test_service_logs[0].id

        # Verify it's actually in the database
        # noinspection PyTypeChecker
        db_email = (
            session.query(JobAlertEmail)
            .filter(JobAlertEmail.external_email_id == linkedin_email_data.external_email_id)
            .first()
        )
        assert db_email is not None
        assert db_email.id == result_email.id

    def test_save_existing_email_returns_existing(
        self, linkedin_email_data, test_service_logs, session, test_users
    ) -> None:
        """Test that existing email is returned without creating a new record"""

        # Pre-seed a row with the same external_email_id but different content,
        # so we can detect whether save_email_to_db overwrites or preserves it.
        # noinspection PyArgumentList
        existing_email = JobAlertEmail(
            external_email_id=linkedin_email_data[0].external_email_id,
            subject="Different Subject",
            sender="different@example.com",
            owner_id=test_users[0].id,
            service_log_id=test_service_logs[0].id,
        )
        session.add(existing_email)
        session.commit()

        result_email, is_created = GmailScraper.save_email_to_db(
            linkedin_email_data[0], test_service_logs[0].id, session
        )

        assert is_created is False
        assert result_email.id == existing_email.id
        assert result_email.subject == "Different Subject"  # Original data preserved

        # Verify only one record exists
        # noinspection PyTypeChecker
        email_count = (
            session.query(JobAlertEmail)
            .filter(JobAlertEmail.external_email_id == linkedin_email_data[0].external_email_id)
            .count()
        )
        assert email_count == 1
class TestExtractLinkedinJobIds:
    """Tests for the GmailScraper.extract_linkedin_job_ids static method."""

    def test_extract_linkedin_job_ids_real_email(self, linkedin_email_data) -> None:
        """The real LinkedIn fixture email yields exactly the six known job ids."""

        extracted = GmailScraper.extract_linkedin_job_ids(linkedin_email_data[0].body)

        assert len(extracted) == 6
        assert extracted == LINKEDIN_JOB_IDS

    def test_extract_linkedin_job_ids_empty_body(self) -> None:
        """An empty body yields an empty id list."""

        extracted = GmailScraper.extract_linkedin_job_ids("")
        assert extracted == []

    def test_extract_linkedin_job_ids_no_jobs(self) -> None:
        """A body containing unrelated URLs (no LinkedIn job-view links) yields nothing."""

        body = """
        This is a test email with no LinkedIn job URLs.
        It contains some other URLs like:
        - https://www.google.com
        - https://www.example.com
        - https://www.linkedin.com/profile/some-user
        But no job view URLs.
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)
        assert extracted == []

    @pytest.mark.parametrize(
        "url_pattern,expected_id",
        [
            ("https://www.linkedin.com/jobs/view/1234567890", "1234567890"),
            ("https://www.linkedin.com/comm/jobs/view/9876543210", "9876543210"),
            ("HTTPS://WWW.LINKEDIN.COM/JOBS/VIEW/5555555555", "5555555555"),
            ("https://linkedin.com/jobs/view/1111111111", "1111111111"),
            ("http://www.linkedin.com/jobs/view/2222222222", "2222222222"),
        ],
    )
    def test_extract_linkedin_job_ids_url_variations(self, url_pattern, expected_id) -> None:
        """Scheme, casing, and /comm/ path variations are all recognised."""

        body = f"Check out this job: {url_pattern}"

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert len(extracted) == 1
        assert extracted[0] == expected_id

    def test_extract_linkedin_job_ids_with_duplicate_ids(self) -> None:
        """Repeated job ids are collapsed, preserving first-seen order."""

        body = """
        Job 1: https://www.linkedin.com/jobs/view/1111111111
        Job 2: https://www.linkedin.com/jobs/view/2222222222
        Job 3: https://www.linkedin.com/jobs/view/1111111111
        Job 4: https://www.linkedin.com/jobs/view/3333333333
        Job 5: https://www.linkedin.com/jobs/view/2222222222
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert len(extracted) == 3
        assert extracted == ["1111111111", "2222222222", "3333333333"]

    def test_extract_linkedin_job_ids_with_query_parameters(self) -> None:
        """Trailing query strings (as in real alert emails) do not break extraction."""

        body = """
        View job: https://www.linkedin.com/comm/jobs/view/4289870503/?trackingId=tt9C%2FzqOXzxRyy9uU5vDOw%3D%3D&refId=something
        Another job: https://www.linkedin.com/jobs/view/1234567890?ref=email&source=alert
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert len(extracted) == 2
        assert "4289870503" in extracted
        assert "1234567890" in extracted

    def test_extract_linkedin_job_ids_malformed_urls(self) -> None:
        """URLs with a missing or non-numeric id segment are skipped."""

        body = """
        Good URL: https://www.linkedin.com/jobs/view/1111111111
        Malformed: https://www.linkedin.com/jobs/view/
        Malformed: https://www.linkedin.com/jobs/view/abcd
        Another good: https://www.linkedin.com/jobs/view/2222222222
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert len(extracted) == 2
        assert extracted == ["1111111111", "2222222222"]
class TestExtractIndeedJobIds:
    """Test class for GmailScraper.extract_indeed_job_ids method"""

    def test_extract_indeed_job_ids_real_email(self, indeed_email_data) -> None:
        """Test extracting Indeed job IDs from real Indeed email content"""

        job_ids = GmailScraper.extract_indeed_job_ids(indeed_email_data[0].body)
        assert job_ids == INDEED_JOB_IDS

    def test_extract_indeed_job_ids_empty_body(self) -> None:
        """Test extracting job IDs from empty body"""

        job_ids = GmailScraper.extract_indeed_job_ids("")
        assert job_ids == []

    def test_extract_indeed_job_ids_no_jobs(self) -> None:
        """Test extracting job IDs from body with no Indeed job URLs"""

        body = """
        This is a test email with no Indeed job URLs.
        It contains some other URLs like:
        - https://www.google.com
        - https://www.example.com
        - https://www.indeed.com/profile/some-user
        But no job view URLs.
        """

        job_ids = GmailScraper.extract_indeed_job_ids(body)
        assert job_ids == []

    @pytest.mark.parametrize(
        "url_pattern,expected_id",
        [
            # Canonical, upper-cased, and extra-query-parameter variants
            ("https://uk.indeed.com/rc/clk/dl?jk=1234567890abcdef&from=ja", "1234567890abcdef"),
            ("HTTPS://UK.INDEED.COM/RC/CLK/DL?JK=5555555555AAAA&FROM=JA", "5555555555AAAA"),
            ("http://indeed.com/rc/clk/dl?jk=1111111111bbbb&other=param", "1111111111bbbb"),
        ],
    )
    def test_extract_indeed_job_ids_url_variations(self, url_pattern, expected_id) -> None:
        """Test extracting job IDs from various URL patterns"""

        body = f"Check out this job: {url_pattern}"

        job_ids = GmailScraper.extract_indeed_job_ids(body)

        assert len(job_ids) == 1
        assert job_ids[0] == expected_id

    def test_extract_indeed_job_ids_with_duplicate_ids(self) -> None:
        """Test that duplicate job IDs are removed (first occurrence order kept)"""

        body = """
        Job 1: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Job 2: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        Job 3: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Job 4: https://uk.indeed.com/rc/clk/dl?jk=3333333333ccc&from=ja
        Job 5: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        """

        job_ids = GmailScraper.extract_indeed_job_ids(body)

        assert len(job_ids) == 3
        assert job_ids == ["1111111111aaa", "2222222222bbb", "3333333333ccc"]

    def test_extract_indeed_job_ids_malformed_urls(self) -> None:
        """Test that malformed Indeed URLs (missing or empty jk parameter) are ignored"""

        body = """
        Good URL: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Malformed: https://uk.indeed.com/rc/clk/dl?from=ja
        Malformed: https://uk.indeed.com/rc/clk/dl?jk=
        Another good: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        """

        job_ids = GmailScraper.extract_indeed_job_ids(body)

        assert len(job_ids) == 2
        assert job_ids == ["1111111111aaa", "2222222222bbb"]
class TestSaveJobsToDb:
    """Test class for GmailScraper.save_jobs_to_db method"""

    def test_save_new_jobs_success(self, test_job_alert_emails, session, test_users) -> None:
        """Test saving new job IDs successfully: one ScrapedJob per id, owned by
        the email's owner and linked back to the email record"""

        job_ids = ["job_123", "job_456", "job_789"]

        result = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        # Verify returned list has correct length
        assert len(result) == 3

        # Verify ownership, id membership, and the job<->email association
        for job_record in result:
            assert job_record.owner_id == test_users[0].id
            assert job_record.external_job_id in job_ids
            assert test_job_alert_emails[0] in job_record.emails

    def test_save_existing_jobs_returns_existing(self, test_job_alert_emails, session, test_users) -> None:
        """Test that existing jobs are returned without creating duplicates"""

        # Create an existing job so one of the two ids is already present
        # noinspection PyArgumentList
        existing_job = ScrapedJob(external_job_id="existing_job_123", owner_id=test_users[0].id)
        session.add(existing_job)
        session.commit()
        session.refresh(existing_job)

        job_ids = ["existing_job_123", "new_job_456"]

        result = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        # Verify returned list has correct length (existing + newly created)
        assert len(result) == 2

    def test_save_jobs_different_owners(self, test_job_alert_emails, session, test_users) -> None:
        """Test that jobs with same external_job_id but different owners are created separately"""

        # Sanity check: the two email fixtures belong to different owners
        assert test_job_alert_emails[0].owner_id != test_job_alert_emails[-1].owner_id

        # Save same job ID for both users
        job_ids = ["same_job_123"]

        result_1 = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        result_2 = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[-1], job_ids=job_ids, db=session)

        # Verify separate job records were created for each owner
        assert len(result_1) == 1
        assert len(result_2) == 1
        assert result_1[0].id != result_2[0].id
        assert result_1[0].owner_id == test_users[0].id
        assert result_2[0].owner_id == test_users[1].id

        # Verify both have the same external job ID
        assert result_1[0].external_job_id == "same_job_123"
        assert result_2[0].external_job_id == "same_job_123"

        # Verify total count in the database
        total_jobs = session.query(ScrapedJob).count()
        assert total_jobs == 2
class TestSaveJobDataToDb:
    """Test class for GmailScraper.save_job_data_to_db method"""

    @pytest.fixture
    def sample_job_data(self) -> dict:
        """Sample scraped job data in the nested dict format the scraper produces
        (top-level company/location, details under the "job" key)"""

        return {
            "company": "Test Company Ltd",
            "location": "London, UK",
            "job": {
                "title": "Senior Software Engineer",
                "description": "We are looking for a senior software engineer to join our team...",
                "url": "https://example.com/job/123",
                "salary": {"min_amount": 50000.0, "max_amount": 70000.0},
            },
        }

    @pytest.fixture
    def sample_scraped_job(self, session, test_users) -> ScrapedJob:
        """Create and persist a bare (not-yet-scraped) ScrapedJob record"""

        # noinspection PyArgumentList
        job = ScrapedJob(
            external_job_id="test_job_123",
            owner_id=test_users[0].id,
        )
        session.add(job)
        session.commit()
        session.refresh(job)
        return job

    def test_save_job_data_single_job_and_data(self, sample_scraped_job, sample_job_data, session) -> None:
        """Test saving job data to a single job record (non-list arguments)"""

        # Verify initial state: the record is unscraped and empty
        assert sample_scraped_job.is_scraped is False
        assert sample_scraped_job.title is None
        assert sample_scraped_job.company is None

        # Save job data
        GmailScraper.save_job_data_to_db(
            job_records=sample_scraped_job, job_data=sample_job_data, db=session, scraped_date=datetime.datetime.now()
        )

        # Refresh the record from database
        session.refresh(sample_scraped_job)

        # Verify the data was saved correctly, including nested job/salary fields
        assert sample_scraped_job.is_scraped is True
        assert sample_scraped_job.company == sample_job_data["company"]
        assert sample_scraped_job.location == sample_job_data["location"]
        assert sample_scraped_job.title == sample_job_data["job"]["title"]
        assert sample_scraped_job.description == sample_job_data["job"]["description"]
        assert sample_scraped_job.url == sample_job_data["job"]["url"]
        assert sample_scraped_job.salary_min == sample_job_data["job"]["salary"]["min_amount"]
        assert sample_scraped_job.salary_max == sample_job_data["job"]["salary"]["max_amount"]

    def test_save_job_data_multiple_jobs_and_data(self, session, test_users) -> None:
        """Test saving job data to multiple job records (list arguments,
        matched positionally record-to-data)"""

        # Create multiple job records
        # noinspection PyArgumentList
        job_1 = ScrapedJob(
            external_job_id="job_1",
            owner_id=test_users[0].id,
            is_scraped=False,
        )
        # noinspection PyArgumentList
        job_2 = ScrapedJob(
            external_job_id="job_2",
            owner_id=test_users[0].id,
            is_scraped=False,
        )
        session.add_all([job_1, job_2])
        session.commit()
        session.refresh(job_1)
        session.refresh(job_2)

        # Create multiple job data entries
        job_data_1 = {
            "company": "Company A",
            "location": "London, UK",
            "job": {
                "title": "Developer A",
                "description": "Description A",
                "url": "https://example.com/job/a",
                "salary": {"min_amount": 40000.0, "max_amount": 60000.0},
            },
        }

        job_data_2 = {
            "company": "Company B",
            "location": "Manchester, UK",
            "job": {
                "title": "Developer B",
                "description": "Description B",
                "url": "https://example.com/job/b",
                "salary": {"min_amount": 45000.0, "max_amount": 65000.0},
            },
        }

        # Save job data
        GmailScraper.save_job_data_to_db(
            job_records=[job_1, job_2],
            job_data=[job_data_1, job_data_2],
            db=session,
            scraped_date=datetime.datetime.now(),
        )

        # Refresh records
        session.refresh(job_1)
        session.refresh(job_2)

        # Verify first job received the first data entry
        assert job_1.is_scraped is True
        assert job_1.company == "Company A"
        assert job_1.title == "Developer A"
        assert job_1.salary_min == 40000.0
        assert job_1.salary_max == 60000.0

        # Verify second job received the second data entry
        assert job_2.is_scraped is True
        assert job_2.company == "Company B"
        assert job_2.title == "Developer B"
        assert job_2.salary_min == 45000.0
        assert job_2.salary_max == 65000.0
732# ------------------------------------------------ GMAILSCRAPER METHODS ------------------------------------------------
735class TestProcessEmailJobs:
736 """Test suite for the _process_email_jobs method."""
    def test_process_linkedin_email_jobs_success(
        self,
        gmail_scraper,
        session,
        linkedin_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of LinkedIn email job ids: one ScrapedJob
        row is created per job id found in the email body"""

        gmail_scraper._process_email(
            db=session,
            email_record=linkedin_email_record[0],
            service_log_entry=test_service_logs[0],
        )

        # Fixture returns (email_record, expected_job_ids); count rows for its owner
        # noinspection PyTypeChecker
        scraped_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == linkedin_email_record[0].owner_id).all()
        assert len(scraped_jobs) == len(linkedin_email_record[1])
    def test_process_indeed_email_jobs_success(
        self,
        gmail_scraper,
        session,
        indeed_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of Indeed email jobs: one ScrapedJob row
        is created per job id found in the email body"""

        gmail_scraper._process_email(
            db=session,
            email_record=indeed_email_record[0],
            service_log_entry=test_service_logs[0],
        )

        # Fixture returns (email_record, expected_job_ids); count rows for its owner
        # noinspection PyTypeChecker
        scraped_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == indeed_email_record[0].owner_id).all()
        assert len(scraped_jobs) == len(indeed_email_record[1])
    def test_process_indeed_email_jobs_success_no_brightapi(
        self,
        gmail_scraper_with_brightapi_skip,
        session,
        indeed_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of Indeed email jobs when BrightAPI
        scraping is skipped: jobs are still saved and returned"""

        result = gmail_scraper_with_brightapi_skip._process_email(
            db=session,
            email_record=indeed_email_record[0],
            service_log_entry=test_service_logs[0],
        )

        # Both the persisted rows and the returned list cover every job id
        # noinspection PyTypeChecker
        scraped_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == indeed_email_record[0].owner_id).all()
        assert len(scraped_jobs) == len(indeed_email_record[1])
        assert len(result) == len(indeed_email_record[1])
796 def test_process_linkedin_email_jobs_success_duplicates_different_owners(
797 self,
798 gmail_scraper,
799 session,
800 linkedin_email_record,
801 linkedin_email_record_user2,
802 test_service_logs,
803 ) -> None:
804 """Test successful processing of LinkedIn email job ids"""
806 gmail_scraper._process_email(
807 db=session,
808 email_record=linkedin_email_record[0],
809 service_log_entry=test_service_logs[0],
810 )
812 gmail_scraper._process_email(
813 db=session,
814 email_record=linkedin_email_record_user2[0],
815 service_log_entry=test_service_logs[0],
816 )
818 # noinspection PyTypeChecker
819 scraped_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == linkedin_email_record[0].owner_id).all()
820 assert len(scraped_jobs) == len(linkedin_email_record[1])
821 # noinspection PyTypeChecker
822 scraped_jobs = (
823 session.query(ScrapedJob).filter(ScrapedJob.owner_id == linkedin_email_record_user2[0].owner_id).all()
824 )
825 assert len(scraped_jobs) == len(linkedin_email_record_user2[1])
827 def test_process_linkedin_email_jobs_success_duplicates_same_owner(
828 self,
829 gmail_scraper,
830 session,
831 linkedin_email_record,
832 test_service_logs,
833 ) -> None:
834 """Test successful processing of LinkedIn email job ids"""
836 gmail_scraper._process_email(
837 db=session,
838 email_record=linkedin_email_record[0],
839 service_log_entry=test_service_logs[0],
840 )
842 gmail_scraper._process_email(
843 db=session,
844 email_record=linkedin_email_record[0],
845 service_log_entry=test_service_logs[0],
846 )
848 # noinspection PyTypeChecker
849 scraped_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == linkedin_email_record[0].owner_id).all()
850 assert len(scraped_jobs) == len(linkedin_email_record[1])
class TestProcessUserEmails:
    """Test class for GmailScraper._process_user_emails method"""

    @staticmethod
    def _email_side_effects(email_map):
        """Build the (get_email_ids, get_email_data) side-effect pair from a mapping
        of user email address -> JobAlertEmailCreate fixture.

        Users absent from the mapping yield no email ids; an unexpected
        get_email_data call raises so a bad lookup fails the test loudly.
        :param email_map: dict mapping user email -> email-create schema"""

        def ids_side_effect(user_email, *_args) -> list[str]:
            """Return the single configured email id for known users, else nothing"""
            email = email_map.get(user_email)
            return [email.external_email_id] if email else []

        def data_side_effect(email_id, user_email) -> schemas.JobAlertEmailCreate:
            """Return the configured email data for known users"""
            email = email_map.get(user_email)
            if email is not None:
                return email
            raise ValueError(f"Unexpected call for user {user_email} and email {email_id}")

        return ids_side_effect, data_side_effect

    def test_single_user(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
    ) -> None:
        """Test successful processing of emails for a single user with LinkedIn email"""

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):
            # Only the first user has an email waiting
            ids_fn, data_fn = self._email_side_effects({test_users[0].email: linkedin_email_data[0]})
            mock_get_email_ids.side_effect = ids_fn
            mock_get_email_data.side_effect = data_fn

            # Call the method
            result = gmail_scraper._process_user_emails(
                db=session, timedelta_days=1, service_log_entry=test_service_logs[0]
            )

            # Verify service log updates
            assert test_service_logs[0].users_processed_n == len(test_users)
            assert test_service_logs[0].emails_found_n == 1
            assert test_service_logs[0].emails_saved_n == 1

            # Verify email was saved to database
            # noinspection PyTypeChecker
            saved_emails = (
                session.query(JobAlertEmail)
                .filter(JobAlertEmail.external_email_id == linkedin_email_data[0].external_email_id)
                .all()
            )
            assert len(saved_emails) == 1
            assert saved_emails[0].platform == linkedin_email_data[0].platform

            # Verify jobs were created only for the first user
            # noinspection PyTypeChecker
            user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
            assert len(user1_jobs) == len(linkedin_email_data[1])

            # Verify no jobs for other users
            for i in range(1, len(test_users)):
                # noinspection PyTypeChecker
                user_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[i].id).all()
                assert len(user_jobs) == 0

            # Verify empty result (no job data for LinkedIn without scraping)
            assert result == {}

    def test_multiple_users(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
        indeed_email_data_user2,
    ) -> None:
        """Test successful processing of emails for multiple users with different email types"""

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):
            # User 1 receives a LinkedIn alert, user 2 an Indeed alert
            ids_fn, data_fn = self._email_side_effects(
                {
                    test_users[0].email: linkedin_email_data[0],
                    test_users[1].email: indeed_email_data_user2[0],
                }
            )
            mock_get_email_ids.side_effect = ids_fn
            mock_get_email_data.side_effect = data_fn

            # Call the method
            gmail_scraper._process_user_emails(db=session, timedelta_days=2, service_log_entry=test_service_logs[0])

            # Verify service log updates
            assert test_service_logs[0].users_processed_n == len(test_users)
            assert test_service_logs[0].emails_found_n == 2
            assert test_service_logs[0].emails_saved_n == 2
            assert test_service_logs[0].linkedin_job_n == len(linkedin_email_data[1])
            assert test_service_logs[0].indeed_job_n == len(indeed_email_data_user2[1])
            assert test_service_logs[0].jobs_extracted_n == len(linkedin_email_data[1]) + len(
                indeed_email_data_user2[1]
            )

            # Verify jobs were created for appropriate users
            # noinspection PyTypeChecker
            user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
            # noinspection PyTypeChecker
            user2_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[1].id).all()
            assert len(user1_jobs) == len(linkedin_email_data[1])
            assert len(user2_jobs) == len(indeed_email_data_user2[1])

            # Verify no jobs for remaining users (if any)
            for i in range(2, len(test_users)):
                # noinspection PyTypeChecker
                user_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[i].id).all()
                assert len(user_jobs) == 0

    def test_multiple_users_same_jobs(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
        linkedin_email_data_user2,
    ) -> None:
        """Test processing of LinkedIn emails for two users whose alerts contain the same jobs"""

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):
            # Both users receive LinkedIn alerts (overlapping job content)
            ids_fn, data_fn = self._email_side_effects(
                {
                    test_users[0].email: linkedin_email_data[0],
                    test_users[1].email: linkedin_email_data_user2[0],
                }
            )
            mock_get_email_ids.side_effect = ids_fn
            mock_get_email_data.side_effect = data_fn

            # Call the method
            gmail_scraper._process_user_emails(db=session, timedelta_days=2, service_log_entry=test_service_logs[0])

            # Verify service log updates
            assert test_service_logs[0].users_processed_n == len(test_users)
            assert test_service_logs[0].emails_found_n == 2
            assert test_service_logs[0].emails_saved_n == 2
            assert test_service_logs[0].linkedin_job_n == len(linkedin_email_data[1]) + len(
                linkedin_email_data_user2[1]
            )
            assert test_service_logs[0].indeed_job_n == 0
            assert test_service_logs[0].jobs_extracted_n == len(linkedin_email_data[1]) + len(
                linkedin_email_data_user2[1]
            )

            # Verify jobs were created for appropriate users
            # noinspection PyTypeChecker
            user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
            # noinspection PyTypeChecker
            user2_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[1].id).all()
            assert len(user1_jobs) == len(linkedin_email_data[1])
            assert len(user2_jobs) == len(linkedin_email_data_user2[1])

            # Verify no jobs for remaining users (if any)
            for i in range(2, len(test_users)):
                # noinspection PyTypeChecker
                user_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[i].id).all()
                assert len(user_jobs) == 0

    def test_skip_brightdata(
        self,
        gmail_scraper_with_brightapi_skip,
        session,
        test_users,
        test_service_logs,
        indeed_email_data,
    ) -> None:
        """Test that skipping BrightData returns the extracted Indeed job data for later scraping"""

        with (
            patch.object(gmail_scraper_with_brightapi_skip, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper_with_brightapi_skip, "get_email_data") as mock_get_email_data,
        ):
            # Only the first user has an Indeed alert waiting
            ids_fn, data_fn = self._email_side_effects({test_users[0].email: indeed_email_data[0]})
            mock_get_email_ids.side_effect = ids_fn
            mock_get_email_data.side_effect = data_fn

            # Call the method
            result = gmail_scraper_with_brightapi_skip._process_user_emails(
                db=session, timedelta_days=2, service_log_entry=test_service_logs[0]
            )

            # 23 jobs — presumably the number of job links in the fixture email; confirm against fixture
            assert len(result) == 23
class TestScrapeRemainingJobs:
    """Test cases for the _scrape_remaining_jobs method"""

    @staticmethod
    def _scraped_jobs(session, email_record) -> list[ScrapedJob]:
        """Create unscraped ScrapedJob rows linked to the given email record.
        :param session: database session fixture
        :param email_record: (email model, [job ids]) tuple from an *_email_record fixture"""

        scraped_jobs = []
        owner_id = email_record[0].owner_id
        for job_id in email_record[1]:
            # noinspection PyArgumentList
            scraped_job = ScrapedJob(external_job_id=job_id, owner_id=owner_id)
            scraped_job.emails.append(email_record[0])
            session.add(scraped_job)
            scraped_jobs.append(scraped_job)
        session.commit()
        return scraped_jobs

    @pytest.fixture
    def indeed_scraped_jobs(self, test_users, session, indeed_email_record) -> list[ScrapedJob]:
        """Fixture to create Indeed scraped jobs for the first user"""

        return self._scraped_jobs(session, indeed_email_record)

    @pytest.fixture
    def indeed_scraped_jobs_user2(self, test_users, session, indeed_email_record_user2) -> list[ScrapedJob]:
        """Fixture to create Indeed scraped jobs for the second user"""

        return self._scraped_jobs(session, indeed_email_record_user2)

    @pytest.fixture
    def linkedin_scraped_jobs(self, test_users, session, linkedin_email_record) -> list[ScrapedJob]:
        """Fixture to create LinkedIn scraped jobs for the first user"""

        return self._scraped_jobs(session, linkedin_email_record)

    def test_indeed_success(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test successful scraping of pending Indeed jobs"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            # Create mock instance
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are now scraped without errors
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert job.scrape_error is None

    def test_indeed_nobrightapi_success(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper_with_brightapi_skip,
        session,
    ) -> None:
        """Test successful scraping of Indeed jobs from pre-extracted email data (BrightData skipped)"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            # Create mock instance
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Build the job_data mapping the skip-BrightData path expects: job id -> extracted job dict
            jobs = extract_indeed_jobs_from_email(indeed_scraped_jobs[0].emails[0].body)
            job_data = {}
            for job in jobs:
                job_ids = gmail_scraper_with_brightapi_skip.extract_indeed_job_ids(job["job"]["url"])
                if job_ids:  # Make sure we have at least one job ID
                    job_data[job_ids[0]] = job
            gmail_scraper_with_brightapi_skip._scrape_remaining_jobs(session, test_service_logs[0], job_data)

            # Verify all jobs are now scraped
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

    def test_indeed_nobrightapi_fail(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper_with_brightapi_skip,
        session,
    ) -> None:
        """Test that Indeed jobs fail when BrightData is skipped and no email job data is supplied"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            # Create mock instance
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Empty job_data: nothing to fall back on, so jobs should be marked failed
            gmail_scraper_with_brightapi_skip._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are marked scraped but failed
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert job.is_failed

    def test_linkedin_success(
        self,
        linkedin_scraped_jobs,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test successful scraping of pending LinkedIn jobs"""

        with patch("app.eis.email_scraper.LinkedinJobScraper") as mock_scraper_class:
            # Create mock instance
            # NOTE(review): seeded with INDEED_JOB_IDS — confirm a LinkedIn id list isn't intended here
            mock_scraper_instance = MockLinkedinJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are now scraped
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

    def test_indeed_multiple_users_shared_jobs_success(
        self,
        indeed_scraped_jobs,
        indeed_scraped_jobs_user2,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test that jobs shared between users are scraped once but saved for each user"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            # Create mock instance
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)

            # Wrap the scrape_job method with a MagicMock to track calls
            original_scrape_job = mock_scraper_instance.scrape_job
            mock_scraper_instance.scrape_job = MagicMock(side_effect=original_scrape_job)

            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are now scraped (one row per user)
            jobs_after = session.query(ScrapedJob).all()
            assert len(jobs_after) == len(indeed_scraped_jobs) + len(indeed_scraped_jobs_user2)
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

            # Shared jobs must be scraped only once, not once per user
            scrape_job_call_count = mock_scraper_instance.scrape_job.call_count
            assert scrape_job_call_count == len(indeed_scraped_jobs)