Coverage for backend / app / job_email_scraping / location_parser.py: 100%

51 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-17 21:34 +0000

1"""Location Parser Module 

2 

3Extracts attendance type from job posting location strings and returns 

4a cleaned location string with attendance indicators removed.""" 

5 

6import re 

7 

8 

9class LocationParser: 

10 """Parser for extracting attendance type from job location strings""" 

11 

12 def __init__(self) -> None: 

13 # Attendance type indicators 

14 self.attendance_indicators = { 

15 "remote": ["remote", "work from home", "wfh", "anywhere", "global", "fully remote"], 

16 "hybrid": ["hybrid", "flexible"], 

17 "on-site": ["on-site", "office", "in-person", "on site", "onsite"], 

18 } 

19 

20 def extract_attendance_type(self, location_str: str) -> str | None: 

21 """Extract attendance type from the location string 

22 :param location_str: Raw location string from the job posting 

23 :return: Attendance type ("remote", "hybrid", "on-site") if found, else None""" 

24 

25 location_lower = location_str.lower() 

26 

27 # Check if both remote and office/on-site indicators are present -> hybrid 

28 has_remote = any(indicator in location_lower for indicator in self.attendance_indicators["remote"]) 

29 has_office = any(indicator in location_lower for indicator in self.attendance_indicators["on-site"]) 

30 

31 if has_remote and has_office: 

32 return "hybrid" 

33 

34 # Check for explicit hybrid indicators 

35 for indicator in self.attendance_indicators["hybrid"]: 

36 if indicator in location_lower: 

37 return "hybrid" 

38 

39 # Check remote indicators 

40 if has_remote: 

41 return "remote" 

42 

43 # Check on-site indicators 

44 if has_office: 

45 return "on-site" 

46 

47 return None 

48 

49 def remove_attendance_indicators(self, location_str: str) -> str: 

50 """Remove attendance type indicators from location string 

51 :param location_str: Raw location string 

52 :return: Cleaned location string with attendance indicators removed""" 

53 

54 cleaned = location_str 

55 

56 # Collect all indicators to remove 

57 all_indicators = [] 

58 for indicators in self.attendance_indicators.values(): 

59 all_indicators.extend(indicators) 

60 

61 # Sort by length (longest first) to avoid partial matches 

62 all_indicators.sort(key=len, reverse=True) 

63 

64 # Remove each indicator (case-insensitive) 

65 for indicator in all_indicators: 

66 pattern = re.compile(re.escape(indicator), re.IGNORECASE) 

67 cleaned = pattern.sub("", cleaned) 

68 

69 # Remove common conjunctions/connectors used between attendance types 

70 connectors = [r"\bor\b", r"\band\b", r"\bor/and\b", r"\b&\b"] 

71 for connector in connectors: 

72 cleaned = re.sub(connector, "", cleaned, flags=re.IGNORECASE) 

73 

74 # Remove empty parentheses, brackets, and braces 

75 cleaned = re.sub(r"\(\s*\)", "", cleaned) # Empty () 

76 cleaned = re.sub(r"\[\s*]", "", cleaned) # Empty [] 

77 cleaned = re.sub(r"\{\s*}", "", cleaned) # Empty {} 

78 

79 # Clean up extra whitespace, commas, and separators 

80 cleaned = re.sub(r"\s*[,/|•·-]\s*", ", ", cleaned) # Normalize separators 

81 cleaned = re.sub(r"\s+", " ", cleaned) # Multiple spaces to single 

82 cleaned = re.sub(r"^[,\s]+|[,\s]+$", "", cleaned) # Trim leading/trailing 

83 cleaned = re.sub(r",+", ",", cleaned) # Multiple commas to single 

84 cleaned = re.sub(r",\s*,", ",", cleaned) # Remove duplicate commas with spaces 

85 

86 # If the result is only wrapped in parentheses/brackets, unwrap it 

87 cleaned = re.sub(r"^\(\s*(.*?)\s*\)$", r"\1", cleaned) # (content) -> content 

88 cleaned = re.sub(r"^\[\s*(.*?)\s*]$", r"\1", cleaned) # [content] -> content 

89 cleaned = re.sub(r"^\{\s*(.*?)\s*}$", r"\1", cleaned) # {content} -> content 

90 

91 return cleaned.strip() 

92 

93 def parse_location(self, location_str: str) -> tuple[str, str | None]: 

94 """Parse a location string and extract the cleaned location and attendance type 

95 :param location_str: Raw location string from the job posting 

96 :return: Tuple of (cleaned location string, attendance_type string or None)""" 

97 

98 if not location_str: 

99 return "", None 

100 

101 location_str = location_str.strip() 

102 

103 if not location_str: 

104 return "", None 

105 

106 # Extract attendance type first 

107 attendance_type = self.extract_attendance_type(location_str) 

108 

109 # Remove attendance indicators from location string 

110 cleaned_location = self.remove_attendance_indicators(location_str) 

111 

112 return cleaned_location, attendance_type