#4595

pending completion

Build # #4595

Build Type

push

coveralls-python

Committed by

suhaibmujahid

Commit Message

Add a new rule to automatically file bugs for new actionable crashes

Run Details

646 of 3406 branches covered (18.97%)

465 of 465 new or added lines in 4 files covered. (100.0%)

1828 of 8469 relevant lines covered (21.58%)

0.22 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

0.0

/bugbot/crash/analyzer.py

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this file,
# You can obtain one at http://mozilla.org/MPL/2.0/.

import itertools
from datetime import timedelta
from functools import cached_property
from typing import Iterable, Iterator

from libmozdata import bugzilla, clouseau, socorro
from libmozdata import utils as lmdutils

from bugbot.components import ComponentName
from bugbot.crash import socorro_util


class NoCrashReportFoundError(Exception):
    """Raised when no crash report matching the required criteria can be found."""


class ClouseauReportsAnalyzer:
    """Derive potential-regressor information from Clouseau crash reports.

    Each report is read as a dict with a ``max_score`` value and a
    ``changesets`` list. Each changeset is read as a dict with ``bug_id``,
    ``changeset``, ``max_score``, ``is_merge`` and ``is_backedout`` entries.
    """

    # Lowest score a report or a changeset must reach to be considered as a
    # potential regressor.
    REGRESSOR_MINIMUM_SCORE: int = 8

    def __init__(self, reports: Iterable[dict]):
        """Store the Clouseau reports to analyze.

        Args:
            reports: The Clouseau reports for a single signature.
        """
        # The reports are traversed several times, so they are materialized
        # once here; a one-shot iterable (e.g., a generator) would otherwise
        # be silently empty on every traversal after the first.
        self._clouseau_reports = list(reports)

    @cached_property
    def max_score(self):
        """The highest ``max_score`` among the reports, or 0 without reports."""
        return max(
            (report["max_score"] for report in self._clouseau_reports),
            default=0,
        )

    def _iter_potential_changesets(self) -> Iterator[dict]:
        """Yield the changesets that are accepted as potential regressors.

        A changeset is accepted when both it and its report reach the minimum
        accepted score, and it is neither a merge nor backed out. The minimum
        accepted score is the larger of `REGRESSOR_MINIMUM_SCORE` and
        `max_score`, i.e., only top-scored entries are kept.
        """
        minimum_accepted_score = max(self.REGRESSOR_MINIMUM_SCORE, self.max_score)
        for report in self._clouseau_reports:
            if report["max_score"] >= minimum_accepted_score:
                for changeset in report["changesets"]:
                    if (
                        changeset["max_score"] >= minimum_accepted_score
                        and not changeset["is_merge"]
                        and not changeset["is_backedout"]
                    ):
                        yield changeset

    @cached_property
    def regressed_by_potential_bug_ids(self) -> set[int]:
        """The ``bug_id`` values of the potential regressor changesets."""
        return {
            changeset["bug_id"] for changeset in self._iter_potential_changesets()
        }

    @cached_property
    def regressed_by_patch(self) -> str | None:
        """The potential regressor changeset, if there is exactly one.

        Returns:
            The ``changeset`` value when a single distinct one is accepted,
            otherwise `None`.
        """
        potential_patches = {
            changeset["changeset"] for changeset in self._iter_potential_changesets()
        }
        if len(potential_patches) == 1:
            return next(iter(potential_patches))
        return None

    @cached_property
    def regressed_by(self) -> int | None:
        """The potential regressor bug ID, if there is exactly one.

        Returns:
            The bug ID when a single distinct one is accepted, otherwise
            `None`.
        """
        bug_ids = self.regressed_by_potential_bug_ids
        if len(bug_ids) == 1:
            return next(iter(bug_ids))
        return None

    @cached_property
    def regressed_by_potential_bugs(self) -> list[dict]:
        """The Bugzilla data for the potential regressor bugs.

        Queries Bugzilla for the bugs in `regressed_by_potential_bug_ids` and
        collects every bug handed to the handler.
        """

        def handler(bug: dict, data: list):
            data.append(bug)

        bugs: list[dict] = []
        bugzilla.Bugzilla(
            bugids=self.regressed_by_potential_bug_ids,
            include_fields=[
                "id",
                "assigned_to",
                "product",
                "component",
            ],
            bughandler=handler,
            bugdata=bugs,
        ).wait()

        return bugs

    @cached_property
    def regressed_by_author(self) -> dict | None:
        """The ``assigned_to_detail`` of the single potential regressor bug.

        Returns:
            `None` when there is no single regressor (see `regressed_by`),
            otherwise the ``assigned_to_detail`` entry of that bug.
        """
        if not self.regressed_by:
            return None

        # With a single regressor, only one bug was requested from Bugzilla.
        bug = self.regressed_by_potential_bugs[0]
        assert bug["id"] == self.regressed_by
        return bug["assigned_to_detail"]

    @cached_property
    def crash_component(self) -> ComponentName:
        """The component to associate with the crash.

        Returns:
            The product/component shared by all potential regressor bugs, or
            "Core :: General" when they do not agree on exactly one.
        """
        potential_components = {
            ComponentName(bug["product"], bug["component"])
            for bug in self.regressed_by_potential_bugs
        }
        if len(potential_components) == 1:
            return next(iter(potential_components))
        return ComponentName("Core", "General")


class SocorroInfoAnalyzer(socorro_util.SignatureStats):
    """Expose Bugzilla-oriented views over the Socorro facets of a signature.

    The facets are read from ``self.signature["facets"]``; ``self.signature``
    is presumably set by `socorro_util.SignatureStats` — TODO confirm.
    """

    # Values accepted by the Bugzilla `op_sys` field; fetched when the class
    # body is executed.
    __bugzilla_os_values = set(bugzilla.BugFields.fetch_field_values("op_sys"))
    # Values accepted by the Bugzilla `rep_platform` field, keyed by their
    # lowercase form; fetched when the class body is executed.
    __bugzilla_cpu_values = {
        value.lower(): value
        for value in bugzilla.BugFields.fetch_field_values("rep_platform")
    }

    @staticmethod
    def _summarize_field_values(values: set[str]) -> str:
        """Collapse a set of Bugzilla field values into a single value.

        Returns:
            The only value when there is exactly one (ignoring "Other" when it
            is paired with a single other value), "Unspecified" when there is
            none, and "All" otherwise.
        """
        if len(values) == 2 and "Other" in values:
            values = values - {"Other"}

        if not values:
            return "Unspecified"

        if len(values) == 1:
            return next(iter(values))

        return "All"

    @classmethod
    def to_bugzilla_op_sys(cls, op_sys: str) -> str:
        """Return the Bugzilla `op_sys` value matching the provided OS name.

        Names that are already valid Bugzilla values are returned unchanged;
        other names are mapped to "macOS", "Windows", "Linux", or "Other".
        """
        if op_sys in cls.__bugzilla_os_values:
            return op_sys

        if op_sys.startswith(("OS X ", "macOS ")):
            return "macOS"
        if op_sys.startswith("Windows"):
            return "Windows"
        if "Linux" in op_sys or op_sys.startswith("Ubuntu"):
            return "Linux"

        return "Other"

    @property
    def bugzilla_op_sys(self) -> str:
        """The Bugzilla `op_sys` value summarizing where the crash occurs.

        Built from the terms of the ``platform_pretty_version`` facet.
        """
        all_op_sys = {
            self.to_bugzilla_op_sys(op_sys["term"])
            for op_sys in self.signature["facets"]["platform_pretty_version"]
        }

        if len(all_op_sys) > 1:
            # Keep only the first word of each value, so values that share it
            # are merged into one; presumably this groups the different
            # versions of the same OS.
            # TODO: confirm and explain this workaround
            all_op_sys = {op_sys.split(" ")[0] for op_sys in all_op_sys}

        return self._summarize_field_values(all_op_sys)

    @classmethod
    def to_bugzilla_cpu(cls, cpu: str) -> str:
        """Return the Bugzilla `rep_platform` value matching the CPU name.

        The match is case-insensitive; unknown names map to "Other".
        """
        # The lookup table is keyed by lowercase names, so the key must be
        # lowercased as well.
        return cls.__bugzilla_cpu_values.get(cpu.lower(), "Other")

    @property
    def bugzilla_cpu_arch(self) -> str:
        """The Bugzilla `rep_platform` value summarizing the crashing CPUs.

        Built from the terms of the ``cpu_arch`` facet.
        """
        all_cpu_arch = {
            self.to_bugzilla_cpu(cpu["term"])
            for cpu in self.signature["facets"]["cpu_arch"]
        }

        return self._summarize_field_values(all_cpu_arch)

    @property
    def num_user_comments(self) -> int:
        """The value of the ``cardinality_user_comments`` facet."""
        # TODO: count useful/interesting user comments (e.g., exclude one word comments)
        return self.signature["facets"]["cardinality_user_comments"]["value"]

    @property
    def has_user_comments(self) -> bool:
        """Whether `num_user_comments` is greater than zero."""
        return self.num_user_comments > 0

    @property
    def top_proto_signature(self) -> str:
        """The term of the first entry in the ``proto_signature`` facet."""
        return self.signature["facets"]["proto_signature"][0]["term"]

    @property
    def num_top_proto_signature_crashes(self) -> int:
        """The count of the first entry in the ``proto_signature`` facet."""
        return self.signature["facets"]["proto_signature"][0]["count"]

    @property
    def build_ids(self) -> Iterator[int]:
        """Yield the term of every entry in the ``build_id`` facet."""
        for build_id in self.signature["facets"]["build_id"]:
            yield build_id["term"]

    @property
    def top_build_id(self) -> int:
        """The term of the first entry in the ``build_id`` facet."""
        return self.signature["facets"]["build_id"][0]["term"]


class SignatureAnalyzer(SocorroInfoAnalyzer, ClouseauReportsAnalyzer):
    """Analyze a crash signature using both Socorro data and Clouseau reports."""

    # Passed as the `platforms` argument when initializing `SocorroInfoAnalyzer`.
    platforms = [
        {"short_name": "win", "name": "Windows"},
        {"short_name": "mac", "name": "Mac OS X"},
        {"short_name": "lin", "name": "Linux"},
        {"short_name": "and", "name": "Android"},
        {"short_name": "unknown", "name": "Unknown"},
    ]

    def __init__(
        self,
        signature: dict,
        num_total_crashes: int,
        clouseau_reports: list[dict],
    ):
        """Initialize both parent analyzers.

        Args:
            signature: The Socorro data for the signature.
            num_total_crashes: The total number of crashes, forwarded to
                `SocorroInfoAnalyzer`.
            clouseau_reports: The Clouseau reports for the signature.
        """
        # The two bases take different arguments, so each one is initialized
        # explicitly.
        SocorroInfoAnalyzer.__init__(
            self,
            signature,
            num_total_crashes,
            platforms=self.platforms,
        )
        ClouseauReportsAnalyzer.__init__(self, clouseau_reports)

    def _fetch_crash_reports(
        self,
        proto_signature: str,
        build_id: int | Iterable[int],
        limit: int = 1,
    ) -> Iterator[dict]:
        """Yield crash reports from Socorro matching the given criteria.

        This is a generator: the search is only sent once iteration starts.

        Args:
            proto_signature: The exact proto signature to match.
            build_id: The build ID, or build IDs, to match.
            limit: The number of results to request.

        Yields:
            The search hits; only the ``uuid`` column is requested.
        """
        search_results: dict = {}

        def collect(response: dict, sink: dict):
            sink.update(response)

        socorro.SuperSearch(
            params={
                "proto_signature": "=" + proto_signature,
                "build_id": build_id,
                "_columns": ["uuid"],
                "_results_number": limit,
            },
            handler=collect,
            handlerdata=search_results,
        ).wait()

        yield from search_results["hits"]

    def fetch_representing_processed_crash(self) -> dict:
        """Fetch a processed crash that can represent the signature.

        When the top proto signature accounts for more than 60% of the
        crashes, only a crash with that proto signature is accepted;
        otherwise, the first candidate is returned.

        Returns:
            The processed crash of the first acceptable candidate.

        Raises:
            NoCrashReportFoundError: If no candidate is acceptable.
        """
        top_share = self.num_top_proto_signature_crashes / self.num_crashes
        must_match_top_proto_signature = top_share > 0.6

        # Reports with a higher score from clouseau are more likely to be
        # useful, so they are tried first.
        by_descending_score = sorted(
            self._clouseau_reports,
            key=lambda report: report["max_score"],
            reverse=True,
        )
        # Next come reports from the top crashing build, because they are
        # likely to be representative, then reports from any of the builds.
        from_top_build = self._fetch_crash_reports(
            self.top_proto_signature, self.top_build_id
        )
        from_all_builds = self._fetch_crash_reports(
            self.top_proto_signature, self.build_ids
        )

        candidates = itertools.chain(
            by_descending_score, from_top_build, from_all_builds
        )
        for candidate in candidates:
            crash_id = candidate["uuid"]
            processed_crash = socorro.ProcessedCrash.get_processed(crash_id)[crash_id]
            if (
                must_match_top_proto_signature
                and processed_crash["proto_signature"] != self.top_proto_signature
            ):
                continue

            # TODO(investigate): maybe we should check if the stack is
            # corrupted (ask gsvelto or willkg about how to detect that)
            return processed_crash

        raise NoCrashReportFoundError(
            f"No crash report found with the most frequent proto signature for {self.signature_term}."
        )


class SignaturesDataFetcher:
    """Fetch the data needed to analyze a set of crash signatures."""

    # Fields that Socorro aggregates for each signature. Every field is listed
    # only once.
    _SIGNATURE_AGGREGATIONS = (
        "build_id",
        "cpu_arch",
        "proto_signature",
        "_cardinality.user_comments",
        "platform_pretty_version",
        # The following are needed for SignatureStats:
        "platform",
        "is_garbage_collecting",
        "_cardinality.install_time",
        "startup_crash",
        "_histogram.uptime",
        "process_type",
    )

    def __init__(
        self,
        signatures,
        product: str = "Firefox",
        channel: str = "nightly",
    ):
        """Store the query criteria.

        Args:
            signatures: The crash signatures to fetch data for.
            product: The product to restrict the queries to.
            channel: The release channel to restrict the queries to.
        """
        self._signatures = signatures
        self._product = product
        self._channel = channel

    def fetch_clouseau_crash_reports(self) -> dict[str, list]:
        """Fetch the Clouseau reports for the signatures.

        Returns:
            The result of `clouseau.Reports.get_by_signatures`; `analyze`
            looks reports up in it by signature term.
        """
        return clouseau.Reports.get_by_signatures(
            self._signatures,
            product=self._product,
            channel=self._channel,
        )

    def fetch_socorro_info(self) -> tuple[list[dict], int]:
        """Fetch the aggregated Socorro data for the signatures.

        The search covers the week ending at the date returned by
        `lmdutils.get_date_ymd("today")`.

        Returns:
            A tuple of the ``signature`` facet entries and the ``total`` value
            from the search results.
        """
        # TODO(investigate): should we increase the duration to 6 months?
        duration = timedelta(weeks=1)
        end_date = lmdutils.get_date_ymd("today")
        start_date = end_date - duration
        date_range = socorro.SuperSearch.get_search_date(start_date, end_date)

        params = {
            "product": self._product,
            # TODO(investigate): should we include all release channels?
            "release_channel": self._channel,
            # TODO(investigate): should we limit based on the build date as well?
            "date": date_range,
            # TODO: split signatures into chunks to avoid very long query URLs
            "signature": ["=" + signature for signature in self._signatures],
            "_aggs.signature": list(self._SIGNATURE_AGGREGATIONS),
            # Only the aggregations are needed, not the individual hits.
            "_results_number": 0,
            "_facets_size": 10000,
        }

        def handler(search_results: dict, data: dict):
            data["num_total_crashes"] = search_results["total"]
            data["signatures"] = search_results["facets"]["signature"]

        data: dict = {}
        socorro.SuperSearchUnredacted(
            params=params,
            handler=handler,
            handlerdata=data,
        ).wait()

        return data["signatures"], data["num_total_crashes"]

    def analyze(self) -> list[SignatureAnalyzer]:
        """Fetch the data and build an analyzer for each signature.

        Returns:
            One `SignatureAnalyzer` for each signature returned by Socorro
            that also has Clouseau reports.
        """
        clouseau_reports = self.fetch_clouseau_crash_reports()
        signatures, num_total_crashes = self.fetch_socorro_info()

        return [
            SignatureAnalyzer(
                signature,
                num_total_crashes,
                clouseau_reports[signature["term"]],
            )
            for signature in signatures
            # TODO(investigate): For now, we are ignoring signatures that are
            # not analyzed by clouseau. We should investigate why they are not
            # analyzed and whether we should include them.
            if signature["term"] in clouseau_reports
        ]

1	# This Source Code Form is subject to the terms of the Mozilla Public
2	# License, v. 2.0. If a copy of the MPL was not distributed with this file,
3	# You can obtain one at http://mozilla.org/MPL/2.0/.
4
5	import itertools	×
6	from datetime import timedelta	×
7	from functools import cached_property	×
8	from typing import Iterable, Iterator	×
9
10	from libmozdata import bugzilla, clouseau, socorro	×
11	from libmozdata import utils as lmdutils	×
12
13	from bugbot.components import ComponentName	×
14	from bugbot.crash import socorro_util	×
15
16
17	class NoCrashReportFoundError(Exception):	×
18	"""Raised when no crash report is found with the required criteria."""
19
20
21	class ClouseauReportsAnalyzer:	×
22	REGRESSOR_MINIMUM_SCORE: int = 8	×
23
24	def __init__(self, reports: Iterable[dict]):	×
25	self._clouseau_reports = reports	×
26
27	@cached_property	×
28	def max_score(self):	×
29	if not self._clouseau_reports:	×
30	return 0	×
31	return max(report["max_score"] for report in self._clouseau_reports)	×
32
33	@cached_property	×
34	def regressed_by_potential_bug_ids(self) -> set[int]:	×
35	minimum_accepted_score = max(self.REGRESSOR_MINIMUM_SCORE, self.max_score)	×
36	return {	×
37	changeset["bug_id"]
38	for report in self._clouseau_reports
39	if report["max_score"] >= minimum_accepted_score
40	for changeset in report["changesets"]
41	if changeset["max_score"] >= minimum_accepted_score
42	and not changeset["is_merge"]
43	and not changeset["is_backedout"]
44	}
45
46	@cached_property	×
47	def regressed_by_patch(self) -> str \| None:	×
48	minimum_accepted_score = max(self.REGRESSOR_MINIMUM_SCORE, self.max_score)	×
49	potential_patches = {	×
50	changeset["changeset"]
51	for report in self._clouseau_reports
52	if report["max_score"] >= minimum_accepted_score
53	for changeset in report["changesets"]
54	if changeset["max_score"] >= minimum_accepted_score
55	and not changeset["is_merge"]
56	and not changeset["is_backedout"]
57	}
58	if len(potential_patches) == 1:	×
59	return next(iter(potential_patches))	×
60	return None	×
61
62	@cached_property	×
63	def regressed_by(self) -> int \| None:	×
64	bug_ids = self.regressed_by_potential_bug_ids	×
65	if len(bug_ids) == 1:	×
66	return next(iter(bug_ids))	×
67	return None	×
68
69	@cached_property	×
70	def regressed_by_potential_bugs(self) -> list[dict]:	×
71	def handler(bug: dict, data: list):	×
72	data.append(bug)	×
73
74	bugs: list[dict] = []	×
75	bugzilla.Bugzilla(	×
76	bugids=self.regressed_by_potential_bug_ids,
77	include_fields=[
78	"id",
79	"assigned_to",
80	"product",
81	"component",
82	],
83	bughandler=handler,
84	bugdata=bugs,
85	).wait()
86
87	return bugs	×
88
89	@cached_property	×
90	def regressed_by_author(self) -> dict \| None:	×
91	if not self.regressed_by:	×
92	return None	×
93
94	bug = self.regressed_by_potential_bugs[0]	×
95	assert bug["id"] == self.regressed_by	×
96	return bug["assigned_to_detail"]	×
97
98	@cached_property	×
99	def crash_component(self) -> ComponentName:	×
100	potential_components = {	×
101	ComponentName(bug["product"], bug["component"])
102	for bug in self.regressed_by_potential_bugs
103	}
104	if len(potential_components) == 1:	×
105	return next(iter(potential_components))	×
106	return ComponentName("Core", "General")	×
107
108
109	class SocorroInfoAnalyzer(socorro_util.SignatureStats):	×
110	__bugzilla_os_values = set(bugzilla.BugFields.fetch_field_values("op_sys"))	×
111	__bugzilla_cpu_values = {	×
112	value.lower(): value
113	for value in bugzilla.BugFields.fetch_field_values("rep_platform")
114	}
115
116	@classmethod	×
117	def to_bugzilla_op_sys(cls, op_sys: str) -> str:	×
118	if op_sys in cls.__bugzilla_os_values:	×
119	return op_sys	×
120
121	if op_sys.startswith("OS X ") or op_sys.startswith("macOS "):	×
122	op_sys = "macOS"	×
123	elif op_sys.startswith("Windows"):	×
124	op_sys = "Windows"	×
125	elif "Linux" in op_sys or op_sys.startswith("Ubuntu"):	×
126	op_sys = "Linux"	×
127	else:
128	op_sys = "Other"	×
129
130	return op_sys	×
131
132	@property	×
133	def bugzilla_op_sys(self) -> str:	×
134	all_op_sys = {	×
135	self.to_bugzilla_op_sys(op_sys["term"])
136	for op_sys in self.signature["facets"]["platform_pretty_version"]
137	}
138
139	if len(all_op_sys) > 1:	×
140	# TODO: explain this workaround
141	all_op_sys = {op_sys.split(" ")[0] for op_sys in all_op_sys}	×
142
143	if len(all_op_sys) == 2 and "Other" in all_op_sys:	×
144	all_op_sys.remove("Other")	×
145
146	if len(all_op_sys) == 1:	×
147	return next(iter(all_op_sys))	×
148
149	if len(all_op_sys) == 0:	×
150	return "Unspecified"	×
151
152	return "All"	×
153
154	@classmethod	×
155	def to_bugzilla_cpu(cls, cpu: str) -> str:	×
156	return cls.__bugzilla_cpu_values.get(cpu, "Other")	×
157
158	@property	×
159	def bugzilla_cpu_arch(self) -> str:	×
160	all_cpu_arch = {	×
161	self.to_bugzilla_cpu(cpu["term"])
162	for cpu in self.signature["facets"]["cpu_arch"]
163	}
164
165	if len(all_cpu_arch) == 2 and "Other" in all_cpu_arch:	×
166	all_cpu_arch.remove("Other")	×
167
168	if len(all_cpu_arch) == 1:	×
169	return next(iter(all_cpu_arch))	×
170
171	if len(all_cpu_arch) == 0:	×
172	return "Unspecified"	×
173
174	return "All"	×
175
176	@property	×
177	def num_user_comments(self) -> int:	×
178	# TODO: count useful/intrusting user comments (e.g., exclude one word comments)
179	return self.signature["facets"]["cardinality_user_comments"]["value"]	×
180
181	@property	×
182	def has_user_comments(self) -> bool:	×
183	return self.num_user_comments > 0	×
184
185	@property	×
186	def top_proto_signature(self) -> str:	×
187	return self.signature["facets"]["proto_signature"][0]["term"]	×
188
189	@property	×
190	def num_top_proto_signature_crashes(self) -> int:	×
191	return self.signature["facets"]["proto_signature"][0]["count"]	×
192
193	@property	×
194	def build_ids(self) -> Iterator[int]:	×
195	for build_id in self.signature["facets"]["build_id"]:	×
196	yield build_id["term"]	×
197
198	@property	×
199	def top_build_id(self) -> int:	×
200	return self.signature["facets"]["build_id"][0]["term"]	×
201
202
203	class SignatureAnalyzer(SocorroInfoAnalyzer, ClouseauReportsAnalyzer):	×
204	platforms = [	×
205	{"short_name": "win", "name": "Windows"},
206	{"short_name": "mac", "name": "Mac OS X"},
207	{"short_name": "lin", "name": "Linux"},
208	{"short_name": "and", "name": "Android"},
209	{"short_name": "unknown", "name": "Unknown"},
210	]
211
212	def __init__(	×
213	self,
214	signature: dict,
215	num_total_crashes: int,
216	clouseau_reports: list[dict],
217	):
218	SocorroInfoAnalyzer.__init__(	×
219	self, signature, num_total_crashes, platforms=self.platforms
220	)
221	ClouseauReportsAnalyzer.__init__(self, clouseau_reports)	×
222
223	def _fetch_crash_reports(	×
224	self,
225	proto_signature: str,
226	build_id: int \| Iterable[int],
227	limit: int = 1,
228	) -> Iterator[dict]:
229	params = {	×
230	"proto_signature": "=" + proto_signature,
231	"build_id": build_id,
232	"_columns": [
233	"uuid",
234	],
235	"_results_number": limit,
236	}
237
238	def handler(res: dict, data: dict):	×
239	data.update(res)	×
240
241	data: dict = {}	×
242	socorro.SuperSearch(params=params, handler=handler, handlerdata=data).wait()	×
243
244	yield from data["hits"]	×
245
246	def fetch_representing_processed_crash(self) -> dict:	×
247	limit_to_top_proto_signature = (	×
248	self.num_top_proto_signature_crashes / self.num_crashes > 0.6
249	)
250
251	reports = itertools.chain(	×
252	# Reports with a higher score from clouseau are more likely to be
253	# useful.
254	sorted(
255	self._clouseau_reports,
256	key=lambda report: report["max_score"],
257	reverse=True,
258	),
259	# Next we try find reports from the top crashing build because they
260	# are likely to be representative.
261	self._fetch_crash_reports(self.top_proto_signature, self.top_build_id),
262	self._fetch_crash_reports(self.top_proto_signature, self.build_ids),
263	)
264	for report in reports:	×
265	uuid = report["uuid"]	×
266	processed_crash = socorro.ProcessedCrash.get_processed(uuid)[uuid]	×
267	if (	×
268	not limit_to_top_proto_signature
269	or processed_crash["proto_signature"] == self.top_proto_signature
270	):
271	# TODO(investigate): maybe we should check if the stack is
272	# corrupted (ask gsvelto or willkg about how to detect that)
273	return processed_crash	×
274
275	raise NoCrashReportFoundError(	×
276	f"No crash report found with the most frequent proto signature for {self.signature_term}."
277	)
278
279
280	class SignaturesDataFetcher:	×
281	def __init__(	×
282	self,
283	signatures,
284	product: str = "Firefox",
285	channel: str = "nightly",
286	):
287	self._signatures = signatures	×
288	self._product = product	×
289	self._channel = channel	×
290
291	def fetch_clouseau_crash_reports(self) -> dict[str, list]:	×
292	return clouseau.Reports.get_by_signatures(	×
293	self._signatures,
294	product=self._product,
295	channel=self._channel,
296	)
297
298	def fetch_socorro_info(self) -> tuple[list[dict], int]:	×
299	# TODO(investigate): should we increase the duration to 6 months?
300	duration = timedelta(weeks=1)	×
301	end_date = lmdutils.get_date_ymd("today")	×
302	start_date = end_date - duration	×
303	date_range = socorro.SuperSearch.get_search_date(start_date, end_date)	×
304
305	params = {	×
306	"product": self._product,
307	# TODO(investigate): should we included all release channels?
308	"release_channel": self._channel,
309	# TODO(investigate): should we limit based on the build date as well?
310	"date": date_range,
311	# TODO: split signatures into chunks to avoid very long query URLs
312	"signature": ["=" + signature for signature in self._signatures],
313	"_aggs.signature": [
314	"build_id",
315	"cpu_arch",
316	"proto_signature",
317	"_cardinality.user_comments",
318	"cpu_arch",
319	"platform_pretty_version",
320	# The following are needed for SignatureStats:
321	"platform",
322	"is_garbage_collecting",
323	"_cardinality.install_time",
324	"startup_crash",
325	"_histogram.uptime",
326	"process_type",
327	],
328	"_results_number": 0,
329	"_facets_size": 10000,
330	}
331
332	def handler(search_results: dict, data: dict):	×
333	data["num_total_crashes"] = search_results["total"]	×
334	data["signatures"] = search_results["facets"]["signature"]	×
335
336	data: dict = {}	×
337	socorro.SuperSearchUnredacted(	×
338	params=params,
339	handler=handler,
340	handlerdata=data,
341	).wait()
342
343	return data["signatures"], data["num_total_crashes"]	×
344
345	def analyze(self) -> list[SignatureAnalyzer]:	×
346	clouseau_reports = self.fetch_clouseau_crash_reports()	×
347	signatures, num_total_crashes = self.fetch_socorro_info()	×
348
349	return [	×
350	SignatureAnalyzer(
351	signature,
352	num_total_crashes,
353	clouseau_reports[signature["term"]],
354	)
355	for signature in signatures
356	# TODO(investigate): For now, we are ignoring signatures that are
357	# not analyzed by clouseau. We should investigate why they are not
358	# analyzed and whether we should include them.
359	if signature["term"] in clouseau_reports
360	]

mozilla / relman-auto-nag / #4595

Source File Press 'n' to go to next uncovered line, 'b' for previous

Source File
Press 'n' to go to next uncovered line, 'b' for previous