Coverage for backend/tests/eis/test_email_scraper.py: 99%

460 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-09-22 15:38 +0000

1"""Test module for email_parser.py functions and GmailScraper class""" 

2 

3import datetime 

4from unittest.mock import MagicMock, patch 

5 

6import pytest 

7 

8from app.eis import schemas 

9from app.eis.email_scraper import clean_email_address, get_user_id_from_email, GmailScraper 

10from app.eis.job_scraper import extract_indeed_jobs_from_email 

11from app.eis.models import JobAlertEmail, ScrapedJob 

12from tests.conftest import open_file 

13from tests.eis.test_job_scraper import MockLinkedinJobScraper, MockIndeedJobScraper 

14 

15 

16# ------------------------------------------------------ FIXTURES ------------------------------------------------------ 

17 

18 

def create_gmail_scraper(**kwargs) -> GmailScraper:
    """Create a GmailScraper instance for testing with mocked file dependencies.

    All filesystem and network touchpoints of the constructor (secrets file,
    token pickle, Gmail API client build, OAuth flow) are patched so the
    scraper can be instantiated offline.

    :param kwargs: keyword arguments passed to the GmailScraper constructor
    :return: a GmailScraper wired to a MagicMock Gmail service
    """

    with (
        patch("builtins.open", create=True),
        patch("json.load") as mock_json_load,
        patch("os.path.exists") as mock_exists,
        patch("pickle.load"),
        patch("pickle.dump"),
        patch("app.eis.email_scraper.build") as mock_build,
    ):

        # Mock the secrets file reading: json.load returns a fake Google
        # OAuth client configuration instead of reading test_secrets.json.
        mock_json_load.return_value = {
            "google_auth": {
                "installed": {
                    "client_id": "test_client_id.apps.googleusercontent.com",
                    "project_id": "test-project",
                    "auth_uri": "https://accounts.google.com/o/oauth2/auth",
                    "token_uri": "https://oauth2.googleapis.com/token",
                    "auth_provider_x509_cert_url": "https://www.googleapis.com/oauth2/v1/certs",
                    "client_secret": "test_client_secret",
                    "redirect_uris": ["http://localhost"],
                }
            }
        }

        # Mock token file doesn't exist (forces the fresh-authentication path)
        mock_exists.return_value = False

        # Mock Gmail service returned by googleapiclient's build()
        mock_service = MagicMock()
        mock_build.return_value = mock_service

        # Mock the OAuth flow so run_local_server() yields valid credentials
        # without opening a browser or binding a local port.
        with patch("google_auth_oauthlib.flow.InstalledAppFlow.from_client_config") as mock_flow:
            mock_credentials = MagicMock()
            mock_credentials.valid = True
            mock_flow_instance = MagicMock()
            mock_flow_instance.run_local_server.return_value = mock_credentials
            mock_flow.return_value = mock_flow_instance

            # Create scraper with mocked dependencies; file names are dummies
            # because open/json.load/pickle are all patched above.
            scraper = GmailScraper(secrets_file="test_secrets.json", token_file="test_token.json", **kwargs)

            return scraper

66 

67 

def mock_get_indeed_redirected_url(url: str) -> str:
    """Offline stand-in for get_indeed_redirected_url.

    Maps the known Indeed ad-click URLs from the test email fixture straight
    to their resolved job keys, avoiding any real HTTP redirect following.

    :param url: an Indeed ``pagead/clk/dl`` URL from the fixture email
    :return: the corresponding ``rc/clk/dl?jk=...`` job URL
    :raises KeyError: if the URL is not one of the known fixture URLs
    """

    url_to_job_key = {
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0CaUNNDciQjB8b911OChydWlMiE438Jot_lydiWr9Z7lbj9cwyJAEEXhSuW8SoD7Wz1bcqpb5rq8IzPxIcuirUCwOlLSL9SL1F572G6Ye9pXIlV00tsAM20VfzF1b86kTFEpwUl5cqoBjsMlRudbS30FMebfIGC01chUG_dRw15uQJAniZZ9m2OwXKNijACF8VWjBKulQ_zZI6qbz8kD41WGqtaC6lMPRCw5kXUrJbTDCaqSpugfThHENgjlu3j5DBWMjvzWpApXtcxY1NTDKT2jg6q-Z5ZkxpZFWJpPicGjeEfETjD8De3kM__AclzfTjESmozVOJMXW85h3mgPZ94GIuFEx8ppqwDwLENrDoalprKNGMFQOeZ9u9dMbxUX_RJCqW9z1vgoP6UivsqTanzYlukGXOhEQ6IFVnNvDODivSUcZCpO_yBMmxlJxaYuRjPQmnuvS8CFyF8B-M_msQscB4GMRxaiGJuzie7_iJr6nKUP2O7lo1n69wInEp_MnehsLtxzcDysc6eBzfF4v2KkuXm1RRPbFqeIA7TK2sPoy2Z8b3VGKVcWv8k90XwuftkqxlnbbXeP3t1ygWiIMHdoJNVKkxUu46MZXtM498k9txG9p9ByQhDcOI8_BRoVsP3DM1wQl1ang-WkAVoo2PTwmdtETp3VlZZuUfSGtYYEdj-E9JmOVulmnyjbLfssmM%3D&xkcb=SoB56_M3u5Oxdj0MCJ0ObzkdCdPP&camk=UoKtGZLa3XLCRNJifgWECQ%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "06498cad9de95b12",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0D_vIW1HWJamhhVblwSY9vEnB3YehQDBaLQWgEpQbAFvEB66TXnGDud1dy-8adNNEA8NkJwfd77g5zBB1ZOXhf8PEjWP1V1-Zs6swoSDNPKB4lvVzHxu1T3qM7FYs12eUEkiIA-iiINRZ_P2VMyvYooQezlTWytMkd2UWxnVCG9a3_m1cyaMA7DTm_syy5wCWCpCUUvgVdIOEOARvgAhUnIIz9x2Chk3LMqtby4HJFP4Jl7C-Vi5YB8H0bSA1FeugROif2FHIwU9gEobz-VsFvEz_Z4cCH3oft61BFqWCWU_wWimKzWAcDGINsjLw9tAunN_xjEdupF33Iwcd77c1urVC1OLKbL3-o2oJRyEPfNL1YN7H5cP_VieI3Fir6psGrVHQv_bNy0yYleEmT0E_DofaYunYAnzMqD_SUhvCDHia8MqrGJkTcgJp16KsMZPr5_mVLck5-3PYB-3khV71Oqfoa7q1yRWl-SN-Qfwc2OdZ8zl9PsK42-6iQ34faa2uibd37I4QFVw_Rwx7r8W-xyXpiwfe4xmkhhRGK1DeiQibftk7Dyp41hCpPZbTW_bL5F98fT1mfh1u1enhw3sXxk_BjcAXS_HZpuWi5zMuwbIztF4a8ZtEo_fNdlevRIwrN-0-0qjuEDoJYSxnY3mvd2WkDit7XyYAQWaCBCtSOLVSvgSDi4pd033dZ1KPZD7a0uFkrEyWaWSQ%3D%3D&xkcb=SoBQ6_M3u5Oxdj0MCJ0MbzkdCdPP&camk=ethIe0s0hedS-FZyNnahJA%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "42b107e214095d56",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0Cf-siO93BSuJ_a-mQFMzVvPBmFGGJg8IeoYoU7n3Hr-wyttwxtthbeGbpHFYWwmmWPWQtznc_slvzvpsaBmSWUWC64QSSNhEuwuNUWHSLtah1bwBpWniJ8vAR5oqbmqlY296quUSNSViPhje6fSFgDWLhGJWLOZaQ6OJRAp-V8a91no5GJKrUzj_KWnmJKR4rz_W6vZS8NYU5v9qDqx0uOlGmg1BnkC5lIZzyqlYwwOiZdPPVaEKKEr_G0GeQvlH67sGm1xTNyJw8sK6-4jN_ENAf2kd7JTexBVkGw5Mo02tAYXFvdA29R0CGRR0lyQRZtFJjgkhZvLHHLYO8JNjy_mia4G2BQ7Sx4ktyjaStia3kR4-BQNNWnr3k3ocyacfQEMHQlqE-Boaf4mwI0-BtJXesJsw9bvP207NBnfZFLJs1hUmSgvHhdYukY2qIsWXJLUVJgOyjwxdLhap0eFBEyti7g0G0mb3e1eO9ATdBP_e0h_p932Dm6wVyAZEXOddagVLoHFiJWPYnq8BUyKvm_S3vp9I57lYRrxWVTKZve2VIP18Uex6Bz0SozYOEEdgfyqQMBRAcp935Hg8aUW8GrXb3Q-js8GxuFke_S_tiEhCyNOEMjhQ-VRl5QOPdFttLD6e9-WR_H8IFLZUu3KwcfMBy1qEq1Tio%3D&xkcb=SoAk6_M3u5Oxdj0MCJ0AbzkdCdPP&camk=ethIe0s0hedep5fbP4CFtg%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "14a9001ba6ebb965",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0COSBp8KgMXxewvi58QAG0wwdlVlJfveGrD5vFIguWoXakoblclqS-4T_znVTPKawHOSHZOxsl_jK0JZuGPspNA9roT-uonvDv2P6RZVLNvLfm0KdPGmVMWwoNgo5H64KiIVwOuf_UrhuMQzHBJIgwJkroSRqxeEQ_3FKwvys8bTaQ85PMumf55yR90-LeyTGL3GXnHmXVXSfC1MDn6qf5BpprmfFM-RGc2WNblsNn6hNEtF-n7NfrAi-f-PzOE_Fjwhx-Y50MEMdlex_3U6MgwFpw7CADiD1Fch2HOI_bhNgCdt6qoLUO2qEA1AX1Ax0_pwn33z2XS_4FOGRcb4ZGqTii1rx-Elj6c6n-95wiR2sks-xrI0uMrPaE2w8P5k5v6tx1ixIQT9liqyzcXoSS6vzmARulIHV4NUWn0e_K4EvX-A-zYBjcEGSGUrLelauCc21fXrDww_gNV_ZSmedh1M06WDaPc3K_6WYtv6-_kkYQhQJyLlyW0Ws23VNL5nfJygGuW8pXeZhbniMlcDaavPtyGoDp4EWGOAI45uMzcbnJ0UyZcRPmuQxfCD8cFz-lmNle1TxlSWFB7j5QOAIn1UbXcKS7gdbhBijiUJWdSdzfbaPNHZdIPMBs6CDUZT5dPrhj_mtNopw4DVvv-OUOAzOpx9mlyJpr5aE7ivabt7_V3CMtJpw7ieYZ4UBA5ZQQ%3D&xkcb=SoCq6_M3u5Oxdj0MCJ0HbzkdCdPP&camk=UoKtGZLa3XL6dp7SxnkD1A%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "eafb032fabcd77bc",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0DUGxYnv6px9uI6dWZhSaSeqMgHWZda7534TRDDAqMKu87sK88i_2Gbq8z1VBS-lbE9HOACaDVAT4jwhaVY_xabO_rq24Y_veJqW-7_usP-_0tRugSmofb5DuxCq5IvmHBw1rNykLW3A5edDY3v_jFGsNtRR7fiXWfgXBO9BJc6FCnwMo2I8cy9hPyydcFqH8iy9UHGKCJzlwGZAiKzNQyLn0rE_XB9MXJX9itgkAFNjlDq17qpEbAnLeIOJCcDXQ03H-DIxBN3ycBF9r29kZ45spvjQItrgoMklzXH3jPwU2j7qTpqQxKVcw5xKYuIWDhM5YqzbSTzr7Z97yKVWDKaB7gM87UyTYdJ32cflCxws1brYrULvaC8SfbTlTbsHvAdrl7BHnq6r6j_pBdFDKWUW-HcBCMgYk3ikg7sr5qwJAmQMqMjyLYUfWLVQ2ouX79v1awn5CT_sz7DqSikuv7MUgfzGrvbjHnov-zAxQfFPwdSmWZkgIz7UdZVOXCV0M6bw-XkaWtkDrGyiJRLOmEPNiiNwLnsKek3SWBSR8qHNbsrDWHz391rS2onjNWfo5gnmims0O-R-8jgV2J2NQyYP0ZNTYquIehRay6WTLbEZRsxgCy4Pgz42H-Z71EnOTwqnZ-8qLPoJRHV0K9oMQL6&xkcb=SoC36_M3u5Oxdj0MCJ0ebzkdCdPP&camk=ethIe0s0hefv8CfXU2K9Rw%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "5aa22054e7a8b76e",
        "https://uk.indeed.com/pagead/clk/dl?mo=r&ad=-6NYlbfkN0BqgWWSVbq3rqstnfUzC8xqhdOuKqZ9Avj77mYlc-g-lgy-1FSdO6PyFnAuQRYfp-JTSxMGeZR4wFhLR1UE4XYsePMvv1exKBMkCeCy9Dh-JYDgYqQLDREEwr5Bfy7uoO_og4WXgkp9rnXdiC6ej8lfOCDGtLs0xpRssH8ApFDX2WPI2WZLU3Dr_bYyzL-F51cHyx5ndFwTEKvG8FqgvbkNe1y7DDUUNUQ1EIdLP4bXw1hDuYRjJm9fbGQDc8LmmrzvdE37KxUZqeU3mzGz2moMrdAZPMufhp93UnQ8QmfOD8uq1LGUenfAtLXc7JvOdVmgZkFtGBtdlJ2Dce9Ty8I9XNaZR1vVTXVwfiM9K6yVwKEH5xhUCsr8a3DFXmcVOrivfiMWlzjRM8Bhtnwff6uJ8CLpNr-VdvfAHJTrsflPiwb6FZFX9sKw1kbd-zDyBDq_vEXiJor5MJKcuzQZ2DH62Tgv_dZllHjmGCWfk5775BFywNThFfEpBqM_-8GhAUHBfb6TSXITGIOiwWH6s7fbs7Fhz8wv20YInHAp2vJ--cjK9uVra5jKMPXk8XB1cUTG-ZWtKfzOtVi4TkT5lfFWC12tyMHgv72MFU3YxnXQZrswfP6D5JhZUJM5toctt1AkDeniJsTqR1-JtOeuQaLjQe7KvUV9qJ_ZUXba6qtMvOfz-BCYBDjc&xkcb=SoAq6_M3u5Oxdj0MCJ0dbzkdCdPP&camk=UoKtGZLa3XJTEZOPwEn50w%3D%3D&p=0&jsa=1997&rjs=1&tmtk=1j3p3fhn5gc8r800&gdfvj=1&alid=672a6c661e474561bc946956&fvj=1&g1tAS=true": "ae47862d410bbd39",
    }

    return f"https://uk.indeed.com/rc/clk/dl?jk={url_to_job_key[url]}"

81 

82 

@pytest.fixture(autouse=True)
def patch_get_indeed_redirected_url(monkeypatch):
    """Automatically patch get_indeed_redirected_url in all tests to avoid real HTTP requests.

    :param monkeypatch: pytest's monkeypatch fixture
    """

    # GmailScraper is already imported at module level, so the redundant
    # function-local import has been removed. The replacement takes only
    # ``url`` — NOTE(review): this assumes the attribute is callable without
    # an instance (e.g. a staticmethod); confirm against GmailScraper.
    monkeypatch.setattr(GmailScraper, "get_indeed_redirected_url", mock_get_indeed_redirected_url)

90 

91 

@pytest.fixture
def gmail_scraper() -> GmailScraper:
    """GmailScraper test instance (BrightAPI Indeed scraping enabled)."""

    scraper = create_gmail_scraper(skip_indeed_brightapi_scraping=False)
    return scraper

97 

98 

@pytest.fixture
def gmail_scraper_with_brightapi_skip() -> GmailScraper:
    """GmailScraper test instance (BrightAPI Indeed scraping skipped)."""

    scraper = create_gmail_scraper(skip_indeed_brightapi_scraping=True)
    return scraper

104 

105 

def create_email_data(
    test_users,
    filename: str,
    platform: str,
    user_index: int,
) -> schemas.JobAlertEmailCreate:
    """Create a JobAlertEmailCreate payload for testing.

    :param test_users: test users fixture
    :param filename: base name of the email-body fixture file (without extension)
    :param platform: platform name (e.g. "linkedin" or "indeed")
    :param user_index: index of the owning user in ``test_users``
    """

    # Bug fix: ``filename`` was accepted but never used — the f-strings below
    # contained a literal placeholder instead of interpolating it, so every
    # caller loaded the same body and produced colliding external ids.
    ofile = open_file(f"{filename}.txt")
    return schemas.JobAlertEmailCreate(
        external_email_id=f"{filename}_{platform}_{user_index}",
        subject="Subject",
        sender=test_users[user_index].email,
        date_received=datetime.datetime.now(),
        platform=platform,
        body=ofile,
    )

127 

128 

# Job ids extracted from the linkedin email body fixture ("linkedin_email.txt");
# used as the expected result of GmailScraper.extract_linkedin_job_ids.
LINKEDIN_JOB_IDS = [
    "4289870503",
    "4291891707",
    "4291383265",
    "4280354992",
    "4255584864",
    "4265877117",
]

# Job ids extracted from the indeed email body fixture ("indeed_email.txt");
# used as the expected result of GmailScraper.extract_indeed_job_ids.
INDEED_JOB_IDS = [
    "8799a57d87058103",
    "d489097ca0fb185f",
    "7f9c701ebf265b69",
    "0537336f99ba1650",
    "312725e138947a4b",
    "06498cad9de95b12",
    "bd60005166216639",
    "42b107e214095d56",
    "d30493c008b601e3",
    "da413431a0c55ec7",
    "2ed37852402643ab",
    "14a9001ba6ebb965",
    "eafb032fabcd77bc",
    "6838e604ddffd5ac",
    "227d4ccd0823fc96",
    "804b940d2d96b30b",
    "f9aafc9ba4c31c6d",
    "e034f0b761e410ea",
    "37cdb0ba59e12295",
    "7b272f46e4e46a14",
    "d6110bfb54bdeddb",
    "5aa22054e7a8b76e",
    "ae47862d410bbd39",
]

165 

166 

@pytest.fixture
def linkedin_email_data(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """LinkedIn job alert email payload plus the job ids embedded in it (user 0)."""

    payload = create_email_data(test_users, "linkedin_email", "linkedin", 0)
    return payload, LINKEDIN_JOB_IDS

172 

173 

@pytest.fixture
def linkedin_email_data_user2(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """LinkedIn job alert email payload plus the job ids embedded in it (user 1)."""

    payload = create_email_data(test_users, "linkedin_email", "linkedin", 1)
    return payload, LINKEDIN_JOB_IDS

179 

180 

@pytest.fixture
def indeed_email_data(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Indeed job alert email payload plus the job ids embedded in it (user 0)."""

    payload = create_email_data(test_users, "indeed_email", "indeed", 0)
    return payload, INDEED_JOB_IDS

186 

187 

@pytest.fixture
def indeed_email_data_user2(test_users) -> tuple[schemas.JobAlertEmailCreate, list[str]]:
    """Create an Indeed job alert email payload for testing (user 1).

    Fix: removed the unused ``session`` fixture parameter — no sibling
    ``*_email_data`` fixture requests it and the payload never touches the DB.
    """

    return create_email_data(test_users, "indeed_email", "indeed", 1), INDEED_JOB_IDS

193 

194 

def create_email_record(session, test_users, filename: str, platform: str, user_index: int) -> JobAlertEmail:
    """Create and persist a JobAlertEmail record for testing.

    (Docstring fix: this helper creates a JobAlertEmail, not a ScrapedJob.)

    :param session: database session
    :param test_users: test users fixture
    :param filename: file name of the email-body fixture
    :param platform: platform name
    :param user_index: index of the owning user in ``test_users``
    :return: the committed JobAlertEmail record
    """

    email_data = create_email_data(test_users, filename, platform, user_index)
    # noinspection PyArgumentList
    email_record = JobAlertEmail(**email_data.model_dump(), owner_id=test_users[user_index].id)
    session.add(email_record)
    session.commit()
    return email_record

209 

210 

@pytest.fixture
def linkedin_email_record(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create a persisted LinkedIn JobAlertEmail record plus its expected job ids.

    Fix: added the missing return annotation; ``create_email_record`` returns
    a JobAlertEmail, so the tuple element type is JobAlertEmail.
    """

    return create_email_record(session, test_users, "linkedin_email", "linkedin", 0), LINKEDIN_JOB_IDS

216 

217 

@pytest.fixture
def linkedin_email_record_user2(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create a persisted LinkedIn JobAlertEmail record for user 1 plus its job ids.

    Fix: the annotation previously claimed ``schemas.JobAlertEmailCreate``, but
    ``create_email_record`` returns a JobAlertEmail ORM record.
    """

    return create_email_record(session, test_users, "linkedin_email", "linkedin", 1), LINKEDIN_JOB_IDS

223 

224 

@pytest.fixture
def indeed_email_record(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create a persisted Indeed JobAlertEmail record plus its expected job ids.

    Fix: the annotation previously claimed ``schemas.JobAlertEmailCreate``, but
    ``create_email_record`` returns a JobAlertEmail ORM record.
    """

    return create_email_record(session, test_users, "indeed_email", "indeed", 0), INDEED_JOB_IDS

230 

231 

@pytest.fixture
def indeed_email_record_user2(session, test_users) -> tuple[JobAlertEmail, list[str]]:
    """Create a persisted Indeed JobAlertEmail record for user 1 plus its job ids.

    Fix: the annotation previously claimed ``schemas.JobAlertEmailCreate``, but
    ``create_email_record`` returns a JobAlertEmail ORM record.
    """

    return create_email_record(session, test_users, "indeed_email", "indeed", 1), INDEED_JOB_IDS

237 

238 

239# --------------------------------------------------- BASE FUNCTIONS --------------------------------------------------- 

240 

241 

class TestCleanEmailAddress:
    """Tests for the clean_email_address helper."""

    @pytest.mark.parametrize(
        "sender_field,expected",
        [
            ("John Doe <john.doe@gmail.com>", "john.doe@gmail.com"),
            ("john.doe@gmail.com", "john.doe@gmail.com"),
            ('"John Doe" <john.doe@gmail.com>', "john.doe@gmail.com"),
            ("Test User <TEST.USER@EXAMPLE.COM>", "test.user@example.com"),
            (" test@example.com ", "test@example.com"),
            ("Invalid Format", "invalid"),
            ("Jane Smith <jane.smith+tag@company.co.uk>", "jane.smith+tag@company.co.uk"),
            ("Multiple Words Name <multi.word@domain.org>", "multi.word@domain.org"),
        ],
    )
    def test_clean_email_address(self, sender_field, expected) -> None:
        """clean_email_address normalises every supported sender format."""

        assert clean_email_address(sender_field) == expected

263 

264 

class TestGetUserIdFromEmail:
    """Tests for the get_user_id_from_email helper."""

    def test_get_user_id_existing_user(self, session, test_users) -> None:
        """A known email resolves to that user's id."""

        user = test_users[0]
        assert get_user_id_from_email(user.email, session) == user.id

    def test_get_user_id_non_existing_user(self, session) -> None:
        """An unknown email raises an AssertionError."""

        with pytest.raises(AssertionError):
            get_user_id_from_email("nonexistent@example.com", session)

    def test_get_user_id_empty_email(self, session) -> None:
        """An empty email raises an AssertionError."""

        with pytest.raises(AssertionError):
            get_user_id_from_email("", session)

    def test_get_user_id_case_sensitivity(self, session, test_users) -> None:
        """Lookup is case-sensitive, so an upper-cased email does not match."""

        with pytest.raises(AssertionError):
            get_user_id_from_email(test_users[0].email.upper(), session)

294 

295 

296# --------------------------------------------- GMAILSCRAPER STATIC METHODS -------------------------------------------- 

297 

298 

class TestSaveEmailToDb:
    """Test class for GmailScraper.save_email_to_db method"""

    def test_save_new_email_success(self, linkedin_email_data, test_service_logs, session, test_users) -> None:
        """Test saving a new email successfully: the record is created, all
        payload fields are copied, and it is persisted in the database."""

        # Fixture yields (payload, job_ids); only the payload is needed here.
        linkedin_email_data = linkedin_email_data[0]
        result_email, is_created = GmailScraper.save_email_to_db(linkedin_email_data, test_service_logs[0].id, session)

        assert is_created is True
        assert result_email.external_email_id == linkedin_email_data.external_email_id
        assert result_email.subject == linkedin_email_data.subject
        assert result_email.sender == linkedin_email_data.sender
        assert result_email.platform == linkedin_email_data.platform
        assert result_email.body == linkedin_email_data.body
        # Owner is resolved from the sender address (test_users[0].email).
        assert result_email.owner_id == test_users[0].id
        assert result_email.service_log_id == test_service_logs[0].id

        # Verify it's actually in the database
        # noinspection PyTypeChecker
        db_email = (
            session.query(JobAlertEmail)
            .filter(JobAlertEmail.external_email_id == linkedin_email_data.external_email_id)
            .first()
        )
        assert db_email is not None
        assert db_email.id == result_email.id

    def test_save_existing_email_returns_existing(
        self, linkedin_email_data, test_service_logs, session, test_users
    ) -> None:
        """Test that existing email is returned without creating a new record
        (deduplication is keyed on external_email_id)."""

        # Pre-insert a record with the same external id but different fields,
        # so we can tell whether save_email_to_db overwrote or preserved it.
        # noinspection PyArgumentList
        existing_email = JobAlertEmail(
            external_email_id=linkedin_email_data[0].external_email_id,
            subject="Different Subject",
            sender="different@example.com",
            owner_id=test_users[0].id,
            service_log_id=test_service_logs[0].id,
        )
        session.add(existing_email)
        session.commit()

        result_email, is_created = GmailScraper.save_email_to_db(
            linkedin_email_data[0], test_service_logs[0].id, session
        )

        assert is_created is False
        assert result_email.id == existing_email.id
        assert result_email.subject == "Different Subject"  # Original data preserved

        # Verify only one record exists
        # noinspection PyTypeChecker
        email_count = (
            session.query(JobAlertEmail)
            .filter(JobAlertEmail.external_email_id == linkedin_email_data[0].external_email_id)
            .count()
        )
        assert email_count == 1

359 

360 

class TestExtractLinkedinJobIds:
    """Tests for the static GmailScraper.extract_linkedin_job_ids method."""

    def test_extract_linkedin_job_ids_real_email(self, linkedin_email_data) -> None:
        """A real LinkedIn alert email yields exactly the six expected ids."""

        extracted = GmailScraper.extract_linkedin_job_ids(linkedin_email_data[0].body)

        assert len(extracted) == 6
        assert extracted == LINKEDIN_JOB_IDS

    def test_extract_linkedin_job_ids_empty_body(self) -> None:
        """An empty body yields an empty list."""

        assert GmailScraper.extract_linkedin_job_ids("") == []

    def test_extract_linkedin_job_ids_no_jobs(self) -> None:
        """A body without LinkedIn job-view URLs yields an empty list."""

        body = """
        This is a test email with no LinkedIn job URLs.
        It contains some other URLs like:
        - https://www.google.com
        - https://www.example.com
        - https://www.linkedin.com/profile/some-user
        But no job view URLs.
        """

        assert GmailScraper.extract_linkedin_job_ids(body) == []

    @pytest.mark.parametrize(
        "url_pattern,expected_id",
        [
            ("https://www.linkedin.com/jobs/view/1234567890", "1234567890"),
            ("https://www.linkedin.com/comm/jobs/view/9876543210", "9876543210"),
            ("HTTPS://WWW.LINKEDIN.COM/JOBS/VIEW/5555555555", "5555555555"),
            ("https://linkedin.com/jobs/view/1111111111", "1111111111"),
            ("http://www.linkedin.com/jobs/view/2222222222", "2222222222"),
        ],
    )
    def test_extract_linkedin_job_ids_url_variations(self, url_pattern, expected_id) -> None:
        """Each supported URL shape yields exactly its job id."""

        body = f"Check out this job: {url_pattern}"

        assert GmailScraper.extract_linkedin_job_ids(body) == [expected_id]

    def test_extract_linkedin_job_ids_with_duplicate_ids(self) -> None:
        """Duplicate ids are removed while first-seen order is kept."""

        body = """
        Job 1: https://www.linkedin.com/jobs/view/1111111111
        Job 2: https://www.linkedin.com/jobs/view/2222222222 
        Job 3: https://www.linkedin.com/jobs/view/1111111111
        Job 4: https://www.linkedin.com/jobs/view/3333333333
        Job 5: https://www.linkedin.com/jobs/view/2222222222
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert extracted == ["1111111111", "2222222222", "3333333333"]
        assert len(extracted) == 3

    def test_extract_linkedin_job_ids_with_query_parameters(self) -> None:
        """URLs with query parameters (as in the real email) still yield ids."""

        body = """
        View job: https://www.linkedin.com/comm/jobs/view/4289870503/?trackingId=tt9C%2FzqOXzxRyy9uU5vDOw%3D%3D&refId=something
        Another job: https://www.linkedin.com/jobs/view/1234567890?ref=email&source=alert
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert len(extracted) == 2
        assert "4289870503" in extracted
        assert "1234567890" in extracted

    def test_extract_linkedin_job_ids_malformed_urls(self) -> None:
        """Malformed LinkedIn URLs (missing or non-numeric id) are ignored."""

        body = """
        Good URL: https://www.linkedin.com/jobs/view/1111111111
        Malformed: https://www.linkedin.com/jobs/view/
        Malformed: https://www.linkedin.com/jobs/view/abcd
        Another good: https://www.linkedin.com/jobs/view/2222222222
        """

        extracted = GmailScraper.extract_linkedin_job_ids(body)

        assert extracted == ["1111111111", "2222222222"]
        assert len(extracted) == 2

457 

458 

class TestExtractIndeedJobIds:
    """Tests for the static GmailScraper.extract_indeed_job_ids method."""

    def test_extract_indeed_job_ids_real_email(self, indeed_email_data) -> None:
        """A real Indeed alert email yields exactly the expected job ids."""

        extracted = GmailScraper.extract_indeed_job_ids(indeed_email_data[0].body)
        assert extracted == INDEED_JOB_IDS

    def test_extract_indeed_job_ids_empty_body(self) -> None:
        """An empty body yields an empty list."""

        assert GmailScraper.extract_indeed_job_ids("") == []

    def test_extract_indeed_job_ids_no_jobs(self) -> None:
        """A body without Indeed job URLs yields an empty list."""

        body = """
        This is a test email with no Indeed job URLs.
        It contains some other URLs like:
        - https://www.google.com
        - https://www.example.com
        - https://www.indeed.com/profile/some-user
        But no job view URLs.
        """

        assert GmailScraper.extract_indeed_job_ids(body) == []

    @pytest.mark.parametrize(
        "url_pattern,expected_id",
        [
            ("https://uk.indeed.com/rc/clk/dl?jk=1234567890abcdef&from=ja", "1234567890abcdef"),
            ("HTTPS://UK.INDEED.COM/RC/CLK/DL?JK=5555555555AAAA&FROM=JA", "5555555555AAAA"),
            ("http://indeed.com/rc/clk/dl?jk=1111111111bbbb&other=param", "1111111111bbbb"),
        ],
    )
    def test_extract_indeed_job_ids_url_variations(self, url_pattern, expected_id) -> None:
        """Each supported URL shape yields exactly its jk job key."""

        body = f"Check out this job: {url_pattern}"

        assert GmailScraper.extract_indeed_job_ids(body) == [expected_id]

    def test_extract_indeed_job_ids_with_duplicate_ids(self) -> None:
        """Duplicate job keys are removed while first-seen order is kept."""

        body = """
        Job 1: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Job 2: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        Job 3: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Job 4: https://uk.indeed.com/rc/clk/dl?jk=3333333333ccc&from=ja
        Job 5: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        """

        extracted = GmailScraper.extract_indeed_job_ids(body)

        assert extracted == ["1111111111aaa", "2222222222bbb", "3333333333ccc"]
        assert len(extracted) == 3

    def test_extract_indeed_job_ids_malformed_urls(self) -> None:
        """Malformed Indeed URLs (missing or empty jk parameter) are ignored."""

        body = """
        Good URL: https://uk.indeed.com/rc/clk/dl?jk=1111111111aaa&from=ja
        Malformed: https://uk.indeed.com/rc/clk/dl?from=ja
        Malformed: https://uk.indeed.com/rc/clk/dl?jk=
        Another good: https://uk.indeed.com/rc/clk/dl?jk=2222222222bbb&from=ja
        """

        extracted = GmailScraper.extract_indeed_job_ids(body)

        assert extracted == ["1111111111aaa", "2222222222bbb"]
        assert len(extracted) == 2

537 

538 

class TestSaveJobsToDb:
    """Test class for GmailScraper.save_jobs_to_db method"""

    def test_save_new_jobs_success(self, test_job_alert_emails, session, test_users) -> None:
        """Test saving new job IDs successfully: one ScrapedJob per id, owned
        by the email's owner and linked back to the email record."""

        job_ids = ["job_123", "job_456", "job_789"]

        result = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        # Verify returned list has correct length
        assert len(result) == 3

        # Verify all jobs are ScrapedJob instances
        for job_record in result:
            assert job_record.owner_id == test_users[0].id
            assert job_record.external_job_id in job_ids
            # The many-to-many email<->job link must include the source email.
            assert test_job_alert_emails[0] in job_record.emails

    def test_save_existing_jobs_returns_existing(self, test_job_alert_emails, session, test_users) -> None:
        """Test that existing jobs are returned without creating duplicates
        (deduplication is per external_job_id for the same owner)."""

        # Create existing jobs
        # noinspection PyArgumentList
        existing_job = ScrapedJob(external_job_id="existing_job_123", owner_id=test_users[0].id)
        session.add(existing_job)
        session.commit()
        session.refresh(existing_job)

        # One id already persisted, one brand new.
        job_ids = ["existing_job_123", "new_job_456"]

        result = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        # Verify returned list has correct length
        assert len(result) == 2

    def test_save_jobs_different_owners(self, test_job_alert_emails, session, test_users) -> None:
        """Test that jobs with same external_job_id but different owners are created separately"""

        # Sanity check: the two email fixtures belong to different users.
        assert test_job_alert_emails[0].owner_id != test_job_alert_emails[-1].owner_id

        # Save same job ID for both users
        job_ids = ["same_job_123"]

        result_1 = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[0], job_ids=job_ids, db=session)

        result_2 = GmailScraper.save_jobs_to_db(email_record=test_job_alert_emails[-1], job_ids=job_ids, db=session)

        # Verify separate job records were created for each owner
        assert len(result_1) == 1
        assert len(result_2) == 1
        assert result_1[0].id != result_2[0].id
        assert result_1[0].owner_id == test_users[0].id
        assert result_2[0].owner_id == test_users[1].id

        # Verify both have the same external job ID
        assert result_1[0].external_job_id == "same_job_123"
        assert result_2[0].external_job_id == "same_job_123"

        # Verify total count in the database
        total_jobs = session.query(ScrapedJob).count()
        assert total_jobs == 2

601 

602 

class TestSaveJobDataToDb:
    """Test class for GmailScraper.save_job_data_to_db method"""

    @pytest.fixture
    def sample_job_data(self) -> dict:
        """Sample job data in the expected (scraper-output) nested format:
        top-level company/location plus a nested "job" dict with salary."""

        return {
            "company": "Test Company Ltd",
            "location": "London, UK",
            "job": {
                "title": "Senior Software Engineer",
                "description": "We are looking for a senior software engineer to join our team...",
                "url": "https://example.com/job/123",
                "salary": {"min_amount": 50000.0, "max_amount": 70000.0},
            },
        }

    @pytest.fixture
    def sample_scraped_job(self, session, test_users) -> ScrapedJob:
        """Create and persist a bare ScrapedJob record (not yet scraped)."""

        # noinspection PyArgumentList
        job = ScrapedJob(
            external_job_id="test_job_123",
            owner_id=test_users[0].id,
        )
        session.add(job)
        session.commit()
        session.refresh(job)
        return job

    def test_save_job_data_single_job_and_data(self, sample_scraped_job, sample_job_data, session) -> None:
        """Test saving job data to a single job record: the nested job-data
        dict is flattened onto the ScrapedJob columns and is_scraped is set."""

        # Verify initial state (fresh record, nothing scraped yet)
        assert sample_scraped_job.is_scraped is False
        assert sample_scraped_job.title is None
        assert sample_scraped_job.company is None

        # Save job data (single record + single data dict variant)
        GmailScraper.save_job_data_to_db(
            job_records=sample_scraped_job, job_data=sample_job_data, db=session, scraped_date=datetime.datetime.now()
        )

        # Refresh the record from database
        session.refresh(sample_scraped_job)

        # Verify the data was saved correctly
        assert sample_scraped_job.is_scraped is True
        assert sample_scraped_job.company == sample_job_data["company"]
        assert sample_scraped_job.location == sample_job_data["location"]
        assert sample_scraped_job.title == sample_job_data["job"]["title"]
        assert sample_scraped_job.description == sample_job_data["job"]["description"]
        assert sample_scraped_job.url == sample_job_data["job"]["url"]
        assert sample_scraped_job.salary_min == sample_job_data["job"]["salary"]["min_amount"]
        assert sample_scraped_job.salary_max == sample_job_data["job"]["salary"]["max_amount"]

    def test_save_job_data_multiple_jobs_and_data(self, session, test_users) -> None:
        """Test saving job data to multiple job records: lists of records and
        data dicts are zipped pairwise onto the corresponding rows."""

        # Create multiple job records
        # noinspection PyArgumentList
        job_1 = ScrapedJob(
            external_job_id="job_1",
            owner_id=test_users[0].id,
            is_scraped=False,
        )
        # noinspection PyArgumentList
        job_2 = ScrapedJob(
            external_job_id="job_2",
            owner_id=test_users[0].id,
            is_scraped=False,
        )
        session.add_all([job_1, job_2])
        session.commit()
        session.refresh(job_1)
        session.refresh(job_2)

        # Create multiple job data entries
        job_data_1 = {
            "company": "Company A",
            "location": "London, UK",
            "job": {
                "title": "Developer A",
                "description": "Description A",
                "url": "https://example.com/job/a",
                "salary": {"min_amount": 40000.0, "max_amount": 60000.0},
            },
        }

        job_data_2 = {
            "company": "Company B",
            "location": "Manchester, UK",
            "job": {
                "title": "Developer B",
                "description": "Description B",
                "url": "https://example.com/job/b",
                "salary": {"min_amount": 45000.0, "max_amount": 65000.0},
            },
        }

        # Save job data (list-of-records + list-of-data variant)
        GmailScraper.save_job_data_to_db(
            job_records=[job_1, job_2],
            job_data=[job_data_1, job_data_2],
            db=session,
            scraped_date=datetime.datetime.now(),
        )

        # Refresh records
        session.refresh(job_1)
        session.refresh(job_2)

        # Verify first job
        assert job_1.is_scraped is True
        assert job_1.company == "Company A"
        assert job_1.title == "Developer A"
        assert job_1.salary_min == 40000.0
        assert job_1.salary_max == 60000.0

        # Verify second job
        assert job_2.is_scraped is True
        assert job_2.company == "Company B"
        assert job_2.title == "Developer B"
        assert job_2.salary_min == 45000.0
        assert job_2.salary_max == 65000.0

730 

731 

732# ------------------------------------------------ GMAILSCRAPER METHODS ------------------------------------------------ 

733 

734 

class TestProcessEmailJobs:
    """Test suite for the _process_email_jobs method."""

    @staticmethod
    def _jobs_for_owner(db, owner_id) -> list:
        """Return every ScrapedJob row belonging to the given owner."""
        # noinspection PyTypeChecker
        return db.query(ScrapedJob).filter(ScrapedJob.owner_id == owner_id).all()

    def test_process_linkedin_email_jobs_success(
        self,
        gmail_scraper,
        session,
        linkedin_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of LinkedIn email job ids"""

        email, expected_job_ids = linkedin_email_record
        gmail_scraper._process_email(
            db=session,
            email_record=email,
            service_log_entry=test_service_logs[0],
        )

        # One ScrapedJob row should exist per job id extracted from the email
        assert len(self._jobs_for_owner(session, email.owner_id)) == len(expected_job_ids)

    def test_process_indeed_email_jobs_success(
        self,
        gmail_scraper,
        session,
        indeed_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of Indeed email jobs."""

        email, expected_job_ids = indeed_email_record
        gmail_scraper._process_email(
            db=session,
            email_record=email,
            service_log_entry=test_service_logs[0],
        )

        assert len(self._jobs_for_owner(session, email.owner_id)) == len(expected_job_ids)

    def test_process_indeed_email_jobs_success_no_brightapi(
        self,
        gmail_scraper_with_brightapi_skip,
        session,
        indeed_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of Indeed email jobs."""

        email, expected_job_ids = indeed_email_record
        result = gmail_scraper_with_brightapi_skip._process_email(
            db=session,
            email_record=email,
            service_log_entry=test_service_logs[0],
        )

        # Jobs are still recorded, and the parsed job data is returned to the caller
        assert len(self._jobs_for_owner(session, email.owner_id)) == len(expected_job_ids)
        assert len(result) == len(expected_job_ids)

    def test_process_linkedin_email_jobs_success_duplicates_different_owners(
        self,
        gmail_scraper,
        session,
        linkedin_email_record,
        linkedin_email_record_user2,
        test_service_logs,
    ) -> None:
        """Test successful processing of LinkedIn email job ids"""

        # Process the same job alert for two different owners
        for email_record in (linkedin_email_record, linkedin_email_record_user2):
            gmail_scraper._process_email(
                db=session,
                email_record=email_record[0],
                service_log_entry=test_service_logs[0],
            )

        # Each owner ends up with their own full set of ScrapedJob rows
        for email_record in (linkedin_email_record, linkedin_email_record_user2):
            jobs = self._jobs_for_owner(session, email_record[0].owner_id)
            assert len(jobs) == len(email_record[1])

    def test_process_linkedin_email_jobs_success_duplicates_same_owner(
        self,
        gmail_scraper,
        session,
        linkedin_email_record,
        test_service_logs,
    ) -> None:
        """Test successful processing of LinkedIn email job ids"""

        # Process the identical email twice for the same owner
        for _ in range(2):
            gmail_scraper._process_email(
                db=session,
                email_record=linkedin_email_record[0],
                service_log_entry=test_service_logs[0],
            )

        # Re-processing must not create duplicate rows for the owner
        jobs = self._jobs_for_owner(session, linkedin_email_record[0].owner_id)
        assert len(jobs) == len(linkedin_email_record[1])

852 

class TestProcessUserEmails:
    """Test class for GmailScraper._process_user_emails method"""

    def test_single_user(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
    ) -> None:
        """Test successful processing of emails for a single user with LinkedIn email"""

        first_user_email = test_users[0].email

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):

            def fake_get_email_ids(user_email, _inbox_only, _timedelta_days) -> list[str]:
                """Only the first user has a pending job-alert email."""
                if user_email == first_user_email:
                    return [linkedin_email_data[0].external_email_id]
                return []

            def fake_get_email_data(_email_id, user_email) -> schemas.JobAlertEmailCreate:
                """Serve the LinkedIn fixture email for the first user only."""
                if user_email != first_user_email:
                    raise ValueError(f"Unexpected call for user {user_email}")
                return linkedin_email_data[0]

            mock_get_email_ids.side_effect = fake_get_email_ids
            mock_get_email_data.side_effect = fake_get_email_data

            # Call the method under test
            result = gmail_scraper._process_user_emails(
                db=session, timedelta_days=1, service_log_entry=test_service_logs[0]
            )

        # Service log counters reflect one found and saved email across all users
        assert test_service_logs[0].users_processed_n == len(test_users)
        assert test_service_logs[0].emails_found_n == 1
        assert test_service_logs[0].emails_saved_n == 1

        # The email itself was persisted exactly once
        # noinspection PyTypeChecker
        saved_emails = (
            session.query(JobAlertEmail)
            .filter(JobAlertEmail.external_email_id == linkedin_email_data[0].external_email_id)
            .all()
        )
        assert len(saved_emails) == 1
        assert saved_emails[0].platform == linkedin_email_data[0].platform

        # Jobs were created only for the first user
        # noinspection PyTypeChecker
        user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
        assert len(user1_jobs) == len(linkedin_email_data[1])
        for other_user in test_users[1:]:
            # noinspection PyTypeChecker
            other_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == other_user.id).all()
            assert len(other_jobs) == 0

        # LinkedIn emails produce no deferred job data to return
        assert result == {}

    def test_multiple_users(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
        indeed_email_data_user2,
    ) -> None:
        """Test successful processing of emails for multiple users with different email types"""

        # Map each configured user's address to their fixture email tuple
        email_by_user = {
            test_users[0].email: linkedin_email_data,
            test_users[1].email: indeed_email_data_user2,
        }

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):

            def fake_get_email_ids(user_email, _inbox_only, _timedelta_days) -> list[str]:
                """Return the fixture email id for the two configured users, nothing otherwise."""
                fixture = email_by_user.get(user_email)
                return [fixture[0].external_email_id] if fixture else []

            def fake_get_email_data(email_id, user_email) -> schemas.JobAlertEmailCreate:
                """Return the fixture email for the two configured users."""
                fixture = email_by_user.get(user_email)
                if fixture is None:
                    raise ValueError(f"Unexpected call for user {user_email} and email {email_id}")
                return fixture[0]

            mock_get_email_ids.side_effect = fake_get_email_ids
            mock_get_email_data.side_effect = fake_get_email_data

            # Call the method under test
            gmail_scraper._process_user_emails(db=session, timedelta_days=2, service_log_entry=test_service_logs[0])

        expected_total = len(linkedin_email_data[1]) + len(indeed_email_data_user2[1])

        # Service log counters cover both platforms
        assert test_service_logs[0].users_processed_n == len(test_users)
        assert test_service_logs[0].emails_found_n == 2
        assert test_service_logs[0].emails_saved_n == 2
        assert test_service_logs[0].linkedin_job_n == len(linkedin_email_data[1])
        assert test_service_logs[0].indeed_job_n == len(indeed_email_data_user2[1])
        assert test_service_logs[0].jobs_extracted_n == expected_total

        # Jobs were created for the appropriate owners only
        # noinspection PyTypeChecker
        user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
        # noinspection PyTypeChecker
        user2_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[1].id).all()
        assert len(user1_jobs) == len(linkedin_email_data[1])
        assert len(user2_jobs) == len(indeed_email_data_user2[1])
        for other_user in test_users[2:]:
            # noinspection PyTypeChecker
            other_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == other_user.id).all()
            assert len(other_jobs) == 0

    def test_multiple_users_same_jobs(
        self,
        gmail_scraper,
        session,
        test_users,
        test_service_logs,
        linkedin_email_data,
        linkedin_email_data_user2,
    ) -> None:
        """Test successful processing of emails for multiple users with different email types"""

        # Both users receive a LinkedIn alert (same jobs, different owners)
        email_by_user = {
            test_users[0].email: linkedin_email_data,
            test_users[1].email: linkedin_email_data_user2,
        }

        with (
            patch.object(gmail_scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(gmail_scraper, "get_email_data") as mock_get_email_data,
        ):

            def fake_get_email_ids(user_email, _inbox_only, _timedelta_days) -> list[str]:
                """Return the fixture email id for the two configured users, nothing otherwise."""
                fixture = email_by_user.get(user_email)
                return [fixture[0].external_email_id] if fixture else []

            def fake_get_email_data(email_id, user_email) -> schemas.JobAlertEmailCreate:
                """Return the fixture email for the two configured users."""
                fixture = email_by_user.get(user_email)
                if fixture is None:
                    raise ValueError(f"Unexpected call for user {user_email} and email {email_id}")
                return fixture[0]

            mock_get_email_ids.side_effect = fake_get_email_ids
            mock_get_email_data.side_effect = fake_get_email_data

            # Call the method under test
            gmail_scraper._process_user_emails(db=session, timedelta_days=2, service_log_entry=test_service_logs[0])

        expected_total = len(linkedin_email_data[1]) + len(linkedin_email_data_user2[1])

        # All extracted jobs are LinkedIn jobs; none are Indeed
        assert test_service_logs[0].users_processed_n == len(test_users)
        assert test_service_logs[0].emails_found_n == 2
        assert test_service_logs[0].emails_saved_n == 2
        assert test_service_logs[0].linkedin_job_n == expected_total
        assert test_service_logs[0].indeed_job_n == 0
        assert test_service_logs[0].jobs_extracted_n == expected_total

        # Jobs were created for the appropriate owners only
        # noinspection PyTypeChecker
        user1_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[0].id).all()
        # noinspection PyTypeChecker
        user2_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == test_users[1].id).all()
        assert len(user1_jobs) == len(linkedin_email_data[1])
        assert len(user2_jobs) == len(linkedin_email_data_user2[1])
        for other_user in test_users[2:]:
            # noinspection PyTypeChecker
            other_jobs = session.query(ScrapedJob).filter(ScrapedJob.owner_id == other_user.id).all()
            assert len(other_jobs) == 0

    def test_skip_brightdata(
        self,
        gmail_scraper_with_brightapi_skip,
        session,
        test_users,
        test_service_logs,
        indeed_email_data,
    ) -> None:
        """Test successful processing of emails for multiple users with different email types"""

        scraper = gmail_scraper_with_brightapi_skip
        first_user_email = test_users[0].email

        with (
            patch.object(scraper, "get_email_ids") as mock_get_email_ids,
            patch.object(scraper, "get_email_data") as mock_get_email_data,
        ):

            def fake_get_email_ids(user_email, _inbox_only, _timedelta_days) -> list[str]:
                """Only the first user has a pending Indeed email."""
                if user_email == first_user_email:
                    return [indeed_email_data[0].external_email_id]
                return []

            def fake_get_email_data(email_id, user_email) -> schemas.JobAlertEmailCreate:
                """Serve the Indeed fixture email for the first user only."""
                if user_email != first_user_email:
                    raise ValueError(f"Unexpected call for user {user_email} and email {email_id}")
                return indeed_email_data[0]

            mock_get_email_ids.side_effect = fake_get_email_ids
            mock_get_email_data.side_effect = fake_get_email_data

            # Call the method under test
            result = scraper._process_user_emails(
                db=session, timedelta_days=2, service_log_entry=test_service_logs[0]
            )

        # 23 job entries are expected from the fixture Indeed email body
        # (fixture-dependent count — update if the fixture email changes)
        assert len(result) == 23

1089 

class TestScrapeRemainingJobs:
    """Test cases for the _scrape_remaining_jobs method"""

    @staticmethod
    def _scraped_jobs(session, email_record) -> list[ScrapedJob]:
        """Create one unscraped ScrapedJob per job id in the email record and link each to the email

        :param session: database session used to persist the records
        :param email_record: tuple of (email record, list of external job ids)
        :return: the newly created ScrapedJob records"""

        scraped_jobs = []
        owner_id = email_record[0].owner_id
        for job_id in email_record[1]:
            # noinspection PyArgumentList
            scraped_job = ScrapedJob(external_job_id=job_id, owner_id=owner_id)
            scraped_job.emails.append(email_record[0])
            session.add(scraped_job)
            scraped_jobs.append(scraped_job)
        session.commit()
        return scraped_jobs

    @pytest.fixture
    def indeed_scraped_jobs(self, test_users, session, indeed_email_record) -> list[ScrapedJob]:
        """Fixture to create unscraped Indeed jobs for the first user"""

        return self._scraped_jobs(session, indeed_email_record)

    @pytest.fixture
    def indeed_scraped_jobs_user2(self, test_users, session, indeed_email_record_user2) -> list[ScrapedJob]:
        """Fixture to create unscraped Indeed jobs for the second user"""

        return self._scraped_jobs(session, indeed_email_record_user2)

    @pytest.fixture
    def linkedin_scraped_jobs(self, test_users, session, linkedin_email_record) -> list[ScrapedJob]:
        """Fixture to create unscraped LinkedIn jobs"""

        return self._scraped_jobs(session, linkedin_email_record)

    def test_indeed_success(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test successful scraping of outstanding Indeed jobs"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            # Replace the real scraper with a mock serving canned job data
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing (no pre-parsed job data available)
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are now scraped without errors
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert job.scrape_error is None

    def test_indeed_nobrightapi_success(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper_with_brightapi_skip,
        session,
    ) -> None:
        """Test scraping outstanding Indeed jobs from pre-parsed email data (BrightData skipped)"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Build the pre-parsed job-data map keyed by external job id, as it
            # would be when BrightData scraping is skipped
            jobs = extract_indeed_jobs_from_email(indeed_scraped_jobs[0].emails[0].body)
            job_data = {}
            for job in jobs:
                job_ids = gmail_scraper_with_brightapi_skip.extract_indeed_job_ids(job["job"]["url"])
                if job_ids:  # Make sure we have at least one job ID
                    job_data[job_ids[0]] = job
            gmail_scraper_with_brightapi_skip._scrape_remaining_jobs(session, test_service_logs[0], job_data)

            # Verify all jobs are now scraped successfully
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

    def test_indeed_nobrightapi_fail(
        self,
        indeed_scraped_jobs,
        test_service_logs,
        gmail_scraper_with_brightapi_skip,
        session,
    ) -> None:
        """Test that jobs are marked failed when BrightData is skipped and no email data is available"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # No pre-parsed job data and no BrightData fallback available
            gmail_scraper_with_brightapi_skip._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Jobs are marked as processed but flagged as failed
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert job.is_failed

    def test_linkedin_success(
        self,
        linkedin_scraped_jobs,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test successful scraping of outstanding LinkedIn jobs"""

        with patch("app.eis.email_scraper.LinkedinJobScraper") as mock_scraper_class:
            # NOTE(review): the mock is seeded with INDEED_JOB_IDS even though this
            # exercises the LinkedIn path — confirm LINKEDIN ids are not required here
            mock_scraper_instance = MockLinkedinJobScraper(INDEED_JOB_IDS)
            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Verify all jobs are now scraped successfully
            jobs_after = session.query(ScrapedJob).all()
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

    def test_indeed_multiple_users_shared_jobs_success(
        self,
        indeed_scraped_jobs,
        indeed_scraped_jobs_user2,
        test_service_logs,
        gmail_scraper,
        session,
    ) -> None:
        """Test that jobs shared by two users are scraped once but saved for both owners"""

        with patch("app.eis.email_scraper.IndeedJobScraper") as mock_scraper_class:
            mock_scraper_instance = MockIndeedJobScraper(INDEED_JOB_IDS)

            # Wrap scrape_job with a MagicMock (module-level import — the previous
            # redundant local import of patch/MagicMock has been removed) to count calls
            original_scrape_job = mock_scraper_instance.scrape_job
            mock_scraper_instance.scrape_job = MagicMock(side_effect=original_scrape_job)

            mock_scraper_class.return_value = mock_scraper_instance

            # Call the method we're testing
            gmail_scraper._scrape_remaining_jobs(session, test_service_logs[0], {})

            # Both users' records exist and are scraped successfully
            jobs_after = session.query(ScrapedJob).all()
            assert len(jobs_after) == len(indeed_scraped_jobs) + len(indeed_scraped_jobs_user2)
            for job in jobs_after:
                assert job.is_scraped
                assert not job.is_failed

            # Each distinct external job id is scraped only once even though the two
            # users share the same jobs
            assert mock_scraper_instance.scrape_job.call_count == len(indeed_scraped_jobs)