#4591

pending completion

Build # #4591

Build Type

push

coveralls-python

Committed by

suhaibmujahid

Commit Message

Add a new rule to automatically file bugs for new actionable crashes

Run Details

646 of 3400 branches covered (19.0%)

457 of 457 new or added lines in 4 files covered. (100.0%)

1827 of 8458 relevant lines covered (21.6%)

0.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/bugbot/crash/analyzer.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import itertools
from datetime import timedelta
from functools import cached_property
from typing import Iterable, Iterator

from libmozdata import bugzilla, clouseau, socorro
from libmozdata import utils as lmdutils

from bugbot.components import ComponentName
from bugbot.crash import socorro_util


class NoCrashReportFoundError(Exception):
    """Signal that no crash report satisfying the required criteria exists."""


class ClouseauReportsAnalyzer:
    """Analyze the Clouseau reports of a crash signature.

    The analysis identifies the bugs whose changesets could have introduced
    the crash and derives from them the likely regressor, its assignee, and
    the component the crash most likely belongs to.
    """

    # Reports and changesets scoring below this value are never considered as
    # potential regressors.
    REGRESSOR_MINIMUM_SCORE: int = 8

    def __init__(self, reports: Iterable[dict]):
        """Store the Clouseau reports to analyze.

        Args:
            reports: The Clouseau reports. Each report is read through its
                "max_score" and "changesets" keys; each changeset through its
                "bug_id", "max_score", "is_merge", and "is_backedout" keys.
                The collection is tested for truthiness and iterated more
                than once, so it must not be a one-shot iterator.
        """
        self._clouseau_reports = reports

    @cached_property
    def max_score(self):
        """The highest "max_score" among the reports, or 0 without reports."""
        if not self._clouseau_reports:
            return 0
        return max(report["max_score"] for report in self._clouseau_reports)

    @cached_property
    def regressed_by_candidate_ids(self) -> set[int]:
        """The IDs of the bugs whose changesets could have caused the crash.

        A changeset qualifies only when both it and its report reach the
        accepted score, and it is neither a merge nor backed out.
        """
        # Only the best-scoring changesets are accepted, and only when that
        # best score reaches the minimum threshold.
        minimum_accepted_score = max(self.REGRESSOR_MINIMUM_SCORE, self.max_score)
        return {
            changeset["bug_id"]
            for report in self._clouseau_reports
            if report["max_score"] >= minimum_accepted_score
            for changeset in report["changesets"]
            if changeset["max_score"] >= minimum_accepted_score
            and not changeset["is_merge"]
            and not changeset["is_backedout"]
        }

    @cached_property
    def regressed_by(self) -> int | None:
        """The ID of the regressor bug, or `None` when it is ambiguous.

        A regressor is reported only when there is exactly one candidate.
        """
        bug_ids = self.regressed_by_candidate_ids
        if len(bug_ids) == 1:
            return next(iter(bug_ids))
        return None

    @cached_property
    def regressed_by_candidate_bugs(self) -> list[dict]:
        """The candidate regressor bugs, as fetched from Bugzilla.

        Returns an empty list, without querying Bugzilla, when there are no
        candidate bug IDs.
        """
        bug_ids = self.regressed_by_candidate_ids
        if not bug_ids:
            # Nothing to fetch; avoid issuing a request with no bug IDs.
            return []

        def handler(bug: dict, data: list):
            data.append(bug)

        bugs: list[dict] = []
        bugzilla.Bugzilla(
            bugids=bug_ids,
            include_fields=[
                "id",
                "assigned_to",
                "product",
                "component",
            ],
            bughandler=handler,
            bugdata=bugs,
        ).wait()

        return bugs

    @cached_property
    def regressed_by_author(self) -> dict | None:
        """The details of the regressor bug's assignee.

        Returns `None` when no single regressor could be identified.
        """
        if not self.regressed_by:
            return None

        # With a single regressor there is exactly one candidate bug.
        bug = self.regressed_by_candidate_bugs[0]
        assert bug["id"] == self.regressed_by
        # Bugzilla's REST API names the assignee's user object
        # `assigned_to_detail` (singular).
        return bug["assigned_to_detail"]

    @cached_property
    def crash_component(self) -> ComponentName:
        """The component the crash most likely belongs to.

        When all candidate regressor bugs share a single component, that
        component is returned; otherwise the result is "Core :: General".
        """
        candidate_components = {
            ComponentName(bug["product"], bug["component"])
            for bug in self.regressed_by_candidate_bugs
        }
        if len(candidate_components) == 1:
            return next(iter(candidate_components))

        return ComponentName("Core", "General")


class SocorroInfoAnalyzer(socorro_util.SignatureStats):
    """Analyze the data that Socorro aggregated for a crash signature.

    The properties read the aggregations from `self.signature["facets"]`
    (NOTE(review): `self.signature` is presumably set by the
    `SignatureStats` initializer; confirm against `socorro_util`).
    """

    # Values of the Bugzilla `op_sys` field. They are fetched when the class
    # body is executed.
    __bugzilla_os_values = set(bugzilla.BugFields.fetch_field_values("op_sys"))
    # Maps the lowercased values of the Bugzilla `rep_platform` field to
    # their original spelling.
    __bugzilla_cpu_values = {
        value.lower(): value
        for value in bugzilla.BugFields.fetch_field_values("rep_platform")
    }

    @staticmethod
    def _summarize_field_values(values: set[str]) -> str:
        """Reduce a set of Bugzilla field values to a single field value.

        "Other" is ignored when it accompanies exactly one other value. The
        result is the single remaining value, "Unspecified" when there is no
        value, or "All" when several values remain.
        """
        if len(values) == 2 and "Other" in values:
            values = values - {"Other"}

        if not values:
            return "Unspecified"

        if len(values) == 1:
            return next(iter(values))

        return "All"

    @classmethod
    def to_bugzilla_op_sys(cls, op_sys: str) -> str:
        """Return the Bugzilla `op_sys` value for the provided OS name.

        Names that are already Bugzilla values are returned as is. Other
        names are mapped to "macOS", "Windows", or "Linux" when recognized,
        and to "Other" otherwise.
        """
        if op_sys in cls.__bugzilla_os_values:
            return op_sys

        if op_sys.startswith(("OS X ", "macOS ")):
            return "macOS"

        if op_sys.startswith("Windows"):
            return "Windows"

        if "Linux" in op_sys or op_sys.startswith("Ubuntu"):
            return "Linux"

        return "Other"

    @property
    def bugzilla_op_sys(self) -> str:
        """The Bugzilla `op_sys` value that summarizes where the crash occurs.

        The value is derived from the "platform_pretty_version" facet.
        """
        all_op_sys = {
            self.to_bugzilla_op_sys(op_sys["term"])
            for op_sys in self.signature["facets"]["platform_pretty_version"]
        }

        if len(all_op_sys) > 1:
            # TODO: explain this workaround
            # Keeps only the first word of each value (presumably to merge
            # the versions of the same operating system; to be confirmed).
            all_op_sys = {op_sys.split(" ")[0] for op_sys in all_op_sys}

        return self._summarize_field_values(all_op_sys)

    @classmethod
    def to_bugzilla_cpu(cls, cpu: str) -> str:
        """Return the Bugzilla `rep_platform` value for the provided CPU name.

        The lookup is case-insensitive. Unrecognized names map to "Other".
        """
        # The lookup table is keyed by lowercased values, so the provided
        # name must be lowercased as well to match.
        return cls.__bugzilla_cpu_values.get(cpu.lower(), "Other")

    @property
    def bugzilla_cpu_arch(self) -> str:
        """The Bugzilla `rep_platform` value that summarizes the crashing CPUs.

        The value is derived from the "cpu_arch" facet.
        """
        all_cpu_arch = {
            self.to_bugzilla_cpu(cpu["term"])
            for cpu in self.signature["facets"]["cpu_arch"]
        }

        return self._summarize_field_values(all_cpu_arch)

    @property
    def num_user_comments(self) -> int:
        """The value of the "cardinality_user_comments" facet."""
        # TODO: count useful/interesting user comments (e.g., exclude one word comments)
        return self.signature["facets"]["cardinality_user_comments"]["value"]

    @property
    def has_user_comments(self) -> bool:
        """Whether `num_user_comments` is greater than zero."""
        return self.num_user_comments > 0

    @property
    def top_proto_signature(self) -> str:
        """The term of the first entry of the "proto_signature" facet."""
        return self.signature["facets"]["proto_signature"][0]["term"]

    @property
    def num_top_proto_signature_crashes(self) -> int:
        """The count of the first entry of the "proto_signature" facet."""
        return self.signature["facets"]["proto_signature"][0]["count"]

    @property
    def build_ids(self) -> Iterator[int]:
        """Iterate over the terms of the "build_id" facet, in facet order."""
        for build_id in self.signature["facets"]["build_id"]:
            yield build_id["term"]

    @property
    def top_build_id(self) -> int:
        """The term of the first entry of the "build_id" facet."""
        return self.signature["facets"]["build_id"][0]["term"]


class SignatureAnalyzer(SocorroInfoAnalyzer, ClouseauReportsAnalyzer):
    """Analyze a crash signature using both its Socorro and Clouseau data."""

    # Platform descriptors handed to the Socorro-side initializer.
    platforms = [
        {"short_name": "win", "name": "Windows"},
        {"short_name": "mac", "name": "Mac OS X"},
        {"short_name": "lin", "name": "Linux"},
        {"short_name": "and", "name": "Android"},
        {"short_name": "unknown", "name": "Unknown"},
    ]

    def __init__(
        self,
        signature: dict,
        num_total_crashes: int,
        clouseau_reports: list[dict],
    ):
        """Initialize both analyzer bases.

        Args:
            signature: The Socorro data of the signature.
            num_total_crashes: The total number of crashes, forwarded to the
                Socorro-side initializer.
            clouseau_reports: The Clouseau reports of the signature.
        """
        # The two bases take different arguments, so each initializer is
        # invoked explicitly.
        SocorroInfoAnalyzer.__init__(
            self,
            signature,
            num_total_crashes,
            platforms=self.platforms,
        )
        ClouseauReportsAnalyzer.__init__(self, clouseau_reports)

    def _fetch_crash_reports(
        self,
        proto_signature: str,
        build_id: int | Iterable[int],
        limit: int = 1,
    ) -> Iterator[dict]:
        """Yield crash reports from a Socorro search.

        The generator is lazy: the search runs only once iteration starts.

        Args:
            proto_signature: The proto signature the reports must match.
            build_id: The build ID (or build IDs) passed to the search.
            limit: The requested number of results.

        Yields:
            The search hits; only the "uuid" column is requested.
        """
        search_result: dict = {}

        def collect(res: dict, data: dict):
            data.update(res)

        search = socorro.SuperSearch(
            params={
                "proto_signature": "=" + proto_signature,
                "build_id": build_id,
                "_columns": ["uuid"],
                "_results_number": limit,
            },
            handler=collect,
            handlerdata=search_result,
        )
        search.wait()

        for hit in search_result["hits"]:
            yield hit

    def fetch_representing_processed_crash(self) -> dict:
        """Fetch a processed crash that represents the signature.

        Candidates are tried in order: the Clouseau reports from the highest
        score to the lowest, then a report from the top build ID, then a
        report from any of the build IDs. When the top proto signature
        accounts for more than 60% of the crashes, only a crash with that
        proto signature is accepted.

        Returns:
            The first processed crash that is accepted.

        Raises:
            NoCrashReportFoundError: If none of the candidates is accepted.
        """
        top_share = self.num_top_proto_signature_crashes / self.num_crashes
        must_match_top_proto_signature = top_share > 0.6

        # Clouseau reports first, best score first: a higher score makes a
        # report more likely to be useful.
        scored_reports = sorted(
            self._clouseau_reports,
            key=lambda report: report["max_score"],
            reverse=True,
        )
        # Then reports from the top crashing build, which are likely to be
        # representative, and finally reports from any of the builds.
        top_build_reports = self._fetch_crash_reports(
            self.top_proto_signature, self.top_build_id
        )
        any_build_reports = self._fetch_crash_reports(
            self.top_proto_signature, self.build_ids
        )

        candidates = itertools.chain(
            scored_reports, top_build_reports, any_build_reports
        )
        for candidate in candidates:
            crash_id = candidate["uuid"]
            crash = socorro.ProcessedCrash.get_processed(crash_id)[crash_id]
            if (
                must_match_top_proto_signature
                and crash["proto_signature"] != self.top_proto_signature
            ):
                continue

            # TODO(investigate): maybe we should check if the stack is
            # corrupted (ask gsvelto or willkg about how to detect that)
            return crash

        raise NoCrashReportFoundError(
            f"No crash report found with the most frequent proto signature for {self.signature_term}."
        )


class SignaturesDataFetcher:
    """Fetch the data needed to analyze a set of crash signatures."""

    def __init__(
        self,
        signatures,
        product: str = "Firefox",
        channel: str = "nightly",
    ):
        """Store the search criteria.

        Args:
            signatures: The crash signatures to fetch data for. The
                collection is iterated to build the Socorro query and is
                passed as is to Clouseau.
            product: The product the crashes are limited to.
            channel: The release channel the crashes are limited to.
        """
        self._signatures = signatures
        self._product = product
        self._channel = channel

    def fetch_clouseau_crash_reports(self) -> dict[str, list]:
        """Fetch the Clouseau reports for the signatures.

        Returns:
            The Clouseau result for the stored signatures, product, and
            channel; `analyze()` looks it up by signature term.
        """
        return clouseau.Reports.get_by_signatures(
            self._signatures,
            product=self._product,
            channel=self._channel,
        )

    def fetch_socorro_info(self) -> tuple[list[dict], int]:
        """Fetch the aggregated Socorro data for the signatures.

        The search covers the crashes of the last week.

        Returns:
            A tuple of the "signature" facet of the search result and the
            total number of crashes matching the search.
        """
        # TODO(investigate): should we increase the duration to 6 months?
        duration = timedelta(weeks=1)
        end_date = lmdutils.get_date_ymd("today")
        start_date = end_date - duration
        date_range = socorro.SuperSearch.get_search_date(start_date, end_date)

        params = {
            "product": self._product,
            # TODO(investigate): should we include all release channels?
            "release_channel": self._channel,
            # TODO(investigate): should we limit based on the build date as well?
            "date": date_range,
            # TODO: split signatures into chunks to avoid very long query URLs
            "signature": ["=" + signature for signature in self._signatures],
            "_aggs.signature": [
                "build_id",
                "cpu_arch",
                "proto_signature",
                "_cardinality.user_comments",
                "platform_pretty_version",
                # The following are needed for SignatureStats:
                "platform",
                "is_garbage_collecting",
                "_cardinality.install_time",
                "startup_crash",
                "_histogram.uptime",
                "process_type",
            ],
            # Only the aggregations are needed, not the individual crashes.
            "_results_number": 0,
            "_facets_size": 10000,
        }

        def handler(search_results: dict, data: dict):
            data["num_total_crashes"] = search_results["total"]
            data["signatures"] = search_results["facets"]["signature"]

        data: dict = {}
        socorro.SuperSearchUnredacted(
            params=params,
            handler=handler,
            handlerdata=data,
        ).wait()

        return data["signatures"], data["num_total_crashes"]

    def analyze(self) -> list[SignatureAnalyzer]:
        """Fetch the data and build an analyzer for each signature.

        Returns:
            One `SignatureAnalyzer` per signature returned by Socorro that
            also has Clouseau reports.
        """
        clouseau_reports = self.fetch_clouseau_crash_reports()
        signatures, num_total_crashes = self.fetch_socorro_info()

        return [
            SignatureAnalyzer(
                signature,
                num_total_crashes,
                clouseau_reports[signature["term"]],
            )
            for signature in signatures
            # TODO(investigate): For now, we are ignoring signatures that are
            # not analyzed by clouseau. We should investigate why they are not
            # analyzed and whether we should include them.
            if signature["term"] in clouseau_reports
        ]

1	# This Source Code Form is subject to the terms of the Mozilla Public
2	# License, v. 2.0. If a copy of the MPL was not distributed with this file,
3	# You can obtain one at http://mozilla.org/MPL/2.0/.
4
5	import itertools	×
6	from datetime import timedelta	×
7	from functools import cached_property	×
8	from typing import Iterable, Iterator	×
9
10	from libmozdata import bugzilla, clouseau, socorro	×
11	from libmozdata import utils as lmdutils	×
12
13	from bugbot.components import ComponentName	×
14	from bugbot.crash import socorro_util	×
15
16
17	class NoCrashReportFoundError(Exception):	×
18	"""Raised when no crash report is found with the required criteria."""
19
20
21	class ClouseauReportsAnalyzer:	×
22	REGRESSOR_MINIMUM_SCORE: int = 8	×
23
24	def __init__(self, reports: Iterable[dict]):	×
25	self._clouseau_reports = reports	×
26
27	@cached_property	×
28	def max_score(self):	×
29	if not self._clouseau_reports:	×
30	return 0	×
31	return max(report["max_score"] for report in self._clouseau_reports)	×
32
33	@cached_property	×
34	def regressed_by_candidate_ids(self) -> set[int]:	×
35	minimum_accepted_score = max(self.REGRESSOR_MINIMUM_SCORE, self.max_score)	×
36	return {	×
37	changeset["bug_id"]
38	for report in self._clouseau_reports
39	if report["max_score"] >= minimum_accepted_score
40	for changeset in report["changesets"]
41	if changeset["max_score"] >= minimum_accepted_score
42	and not changeset["is_merge"]
43	and not changeset["is_backedout"]
44	}
45
46	@cached_property	×
47	def regressed_by(self) -> int \| None:	×
48	bug_ids = self.regressed_by_candidate_ids	×
49	if len(bug_ids) == 1:	×
50	return next(iter(bug_ids))	×
51	return None	×
52
53	@cached_property	×
54	def regressed_by_candidate_bugs(self) -> list[dict]:	×
55	def handler(bug: dict, data: list):	×
56	data.append(bug)	×
57
58	bugs: list[dict] = []	×
59	bugzilla.Bugzilla(	×
60	bugids=self.regressed_by_candidate_ids,
61	include_fields=[
62	"id",
63	"assigned_to",
64	"product",
65	"component",
66	],
67	bughandler=handler,
68	bugdata=bugs,
69	).wait()
70
71	return bugs	×
72
73	@cached_property	×
74	def regressed_by_author(self) -> dict \| None:	×
75	if not self.regressed_by:	×
76	return None	×
77
78	bug = self.regressed_by_candidate_bugs[0]	×
79	assert bug["id"] == self.regressed_by	×
80	return bug["assigned_to_details"]	×
81
82	@cached_property	×
83	def crash_component(self) -> ComponentName:	×
84	candidate_components = {	×
85	ComponentName(bug["product"], bug["component"])
86	for bug in self.regressed_by_candidate_bugs
87	}
88	if len(candidate_components) == 1:	×
89	return next(iter(candidate_components))	×
90
91	return ComponentName("Core", "General")	×
92
93
94	class SocorroInfoAnalyzer(socorro_util.SignatureStats):	×
95	__bugzilla_os_values = set(bugzilla.BugFields.fetch_field_values("op_sys"))	×
96	__bugzilla_cpu_values = {	×
97	value.lower(): value
98	for value in bugzilla.BugFields.fetch_field_values("rep_platform")
99	}
100
101	@classmethod	×
102	def to_bugzilla_op_sys(cls, op_sys: str) -> str:	×
103	if op_sys in cls.__bugzilla_os_values:	×
104	return op_sys	×
105
106	if op_sys.startswith("OS X ") or op_sys.startswith("macOS "):	×
107	op_sys = "macOS"	×
108	elif op_sys.startswith("Windows"):	×
109	op_sys = "Windows"	×
110	elif "Linux" in op_sys or op_sys.startswith("Ubuntu"):	×
111	op_sys = "Linux"	×
112	else:
113	op_sys = "Other"	×
114
115	return op_sys	×
116
117	@property	×
118	def bugzilla_op_sys(self) -> str:	×
119	all_op_sys = {	×
120	self.to_bugzilla_op_sys(op_sys["term"])
121	for op_sys in self.signature["facets"]["platform_pretty_version"]
122	}
123
124	if len(all_op_sys) > 1:	×
125	# TODO: explain this workaround
126	all_op_sys = {op_sys.split(" ")[0] for op_sys in all_op_sys}	×
127
128	if len(all_op_sys) == 2 and "Other" in all_op_sys:	×
129	all_op_sys.remove("Other")	×
130
131	if len(all_op_sys) == 1:	×
132	return next(iter(all_op_sys))	×
133
134	if len(all_op_sys) == 0:	×
135	return "Unspecified"	×
136
137	return "All"	×
138
139	@classmethod	×
140	def to_bugzilla_cpu(cls, cpu: str) -> str:	×
141	return cls.__bugzilla_cpu_values.get(cpu, "Other")	×
142
143	@property	×
144	def bugzilla_cpu_arch(self) -> str:	×
145	all_cpu_arch = {	×
146	self.to_bugzilla_cpu(cpu["term"])
147	for cpu in self.signature["facets"]["cpu_arch"]
148	}
149
150	if len(all_cpu_arch) == 2 and "Other" in all_cpu_arch:	×
151	all_cpu_arch.remove("Other")	×
152
153	if len(all_cpu_arch) == 1:	×
154	return next(iter(all_cpu_arch))	×
155
156	if len(all_cpu_arch) == 0:	×
157	return "Unspecified"	×
158
159	return "All"	×
160
161	@property	×
162	def num_user_comments(self) -> int:	×
163	# TODO: count useful/intrusting user comments (e.g., exclude one word comments)
164	return self.signature["facets"]["cardinality_user_comments"]["value"]	×
165
166	@property	×
167	def has_user_comments(self) -> bool:	×
168	return self.num_user_comments > 0	×
169
170	@property	×
171	def top_proto_signature(self) -> str:	×
172	return self.signature["facets"]["proto_signature"][0]["term"]	×
173
174	@property	×
175	def num_top_proto_signature_crashes(self) -> int:	×
176	return self.signature["facets"]["proto_signature"][0]["count"]	×
177
178	@property	×
179	def build_ids(self) -> Iterator[int]:	×
180	for build_id in self.signature["facets"]["build_id"]:	×
181	yield build_id["term"]	×
182
183	@property	×
184	def top_build_id(self) -> int:	×
185	return self.signature["facets"]["build_id"][0]["term"]	×
186
187
188	class SignatureAnalyzer(SocorroInfoAnalyzer, ClouseauReportsAnalyzer):	×
189	platforms = [	×
190	{"short_name": "win", "name": "Windows"},
191	{"short_name": "mac", "name": "Mac OS X"},
192	{"short_name": "lin", "name": "Linux"},
193	{"short_name": "and", "name": "Android"},
194	{"short_name": "unknown", "name": "Unknown"},
195	]
196
197	def __init__(	×
198	self,
199	signature: dict,
200	num_total_crashes: int,
201	clouseau_reports: list[dict],
202	):
203	SocorroInfoAnalyzer.__init__(	×
204	self, signature, num_total_crashes, platforms=self.platforms
205	)
206	ClouseauReportsAnalyzer.__init__(self, clouseau_reports)	×
207
208	def _fetch_crash_reports(	×
209	self,
210	proto_signature: str,
211	build_id: int \| Iterable[int],
212	limit: int = 1,
213	) -> Iterator[dict]:
214	params = {	×
215	"proto_signature": "=" + proto_signature,
216	"build_id": build_id,
217	"_columns": [
218	"uuid",
219	],
220	"_results_number": limit,
221	}
222
223	def handler(res: dict, data: dict):	×
224	data.update(res)	×
225
226	data: dict = {}	×
227	socorro.SuperSearch(params=params, handler=handler, handlerdata=data).wait()	×
228
229	yield from data["hits"]	×
230
231	def fetch_representing_processed_crash(self) -> dict:	×
232	limit_to_top_proto_signature = (	×
233	self.num_top_proto_signature_crashes / self.num_crashes > 0.6
234	)
235
236	reports = itertools.chain(	×
237	# Reports with a higher score from clouseau are more likely to be
238	# useful.
239	sorted(
240	self._clouseau_reports,
241	key=lambda report: report["max_score"],
242	reverse=True,
243	),
244	# Next we try find reports from the top crashing build because they
245	# are likely to be representative.
246	self._fetch_crash_reports(self.top_proto_signature, self.top_build_id),
247	self._fetch_crash_reports(self.top_proto_signature, self.build_ids),
248	)
249	for report in reports:	×
250	uuid = report["uuid"]	×
251	processed_crash = socorro.ProcessedCrash.get_processed(uuid)[uuid]	×
252	if (	×
253	not limit_to_top_proto_signature
254	or processed_crash["proto_signature"] == self.top_proto_signature
255	):
256	# TODO(investigate): maybe we should check if the stack is
257	# corrupted (ask gsvelto or willkg about how to detect that)
258	return processed_crash	×
259
260	raise NoCrashReportFoundError(	×
261	f"No crash report found with the most frequent proto signature for {self.signature_term}."
262	)
263
264
265	class SignaturesDataFetcher:	×
266	def __init__(	×
267	self,
268	signatures,
269	product: str = "Firefox",
270	channel: str = "nightly",
271	):
272	self._signatures = signatures	×
273	self._product = product	×
274	self._channel = channel	×
275
276	def fetch_clouseau_crash_reports(self) -> dict[str, list]:	×
277	return clouseau.Reports.get_by_signatures(	×
278	self._signatures,
279	product=self._product,
280	channel=self._channel,
281	)
282
283	def fetch_socorro_info(self) -> tuple[list[dict], int]:	×
284	# TODO(investigate): should we increase the duration to 6 months?
285	duration = timedelta(weeks=1)	×
286	end_date = lmdutils.get_date_ymd("today")	×
287	start_date = end_date - duration	×
288	date_range = socorro.SuperSearch.get_search_date(start_date, end_date)	×
289
290	params = {	×
291	"product": self._product,
292	# TODO(investigate): should we included all release channels?
293	"release_channel": self._channel,
294	# TODO(investigate): should we limit based on the build date as well?
295	"date": date_range,
296	# TODO: split signatures into chunks to avoid very long query URLs
297	"signature": ["=" + signature for signature in self._signatures],
298	"_aggs.signature": [
299	"build_id",
300	"cpu_arch",
301	"proto_signature",
302	"_cardinality.user_comments",
303	"cpu_arch",
304	"platform_pretty_version",
305	# The following are needed for SignatureStats:
306	"platform",
307	"is_garbage_collecting",
308	"_cardinality.install_time",
309	"startup_crash",
310	"_histogram.uptime",
311	"process_type",
312	],
313	"_results_number": 0,
314	"_facets_size": 10000,
315	}
316
317	def handler(search_results: dict, data: dict):	×
318	data["num_total_crashes"] = search_results["total"]	×
319	data["signatures"] = search_results["facets"]["signature"]	×
320
321	data: dict = {}	×
322	socorro.SuperSearchUnredacted(	×
323	params=params,
324	handler=handler,
325	handlerdata=data,
326	).wait()
327
328	return data["signatures"], data["num_total_crashes"]	×
329
330	def analyze(self) -> list[SignatureAnalyzer]:	×
331	clouseau_reports = self.fetch_clouseau_crash_reports()	×
332	signatures, num_total_crashes = self.fetch_socorro_info()	×
333
334	return [	×
335	SignatureAnalyzer(
336	signature,
337	num_total_crashes,
338	clouseau_reports[signature["term"]],
339	)
340	for signature in signatures
341	# TODO(investigate): For now, we are ignoring signatures that are
342	# not analyzed by clouseau. We should investigate why they are not
343	# analyzed and whether we should include them.
344	if signature["term"] in clouseau_reports
345	]

mozilla / relman-auto-nag / #4591

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous