index/matchtree.go at main · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchtree.go
at main 36 kB View raw
Dmitry Gruzd index: add hybrid go-re2 engine for large file content matching (#1024) 3mo ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package index
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"math"
  22	"regexp/syntax"
  23	"strings"
  24	"unicode/utf8"
  25
  26	"github.com/cespare/xxhash/v2"
  27	"github.com/grafana/regexp"
  28
  29	"github.com/sourcegraph/zoekt"
  30	"github.com/sourcegraph/zoekt/internal/hybridre2"
  31	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  32	"github.com/sourcegraph/zoekt/query"
  33)
  34
  35// A docIterator iterates over documents in order.
  36type docIterator interface {
  37	// provide the next document where we may find something interesting.
  38	//
  39	// This is like a "peek" and shouldn't mutate state. prepare is what should
  40	// change state.
  41	nextDoc() uint32
  42
  43	// clears any per-document state of the docIterator, and
  44	// prepares for evaluating the given doc. The argument is
  45	// strictly increasing over time.
  46	prepare(nextDoc uint32)
  47}
  48
  49// costs are passed in increasing order to matchTree.matches until they do not
  50// return matchesRequiresHigherCost.
  51const (
  52	costConst   = 0
  53	costMemory  = 1
  54	costContent = 2
  55	costRegexp  = 3
  56)
  57
  58const (
  59	costMin = costConst
  60	costMax = costRegexp
  61)
  62
  63// matchesState is an enum for the state of a matchTree after a call to
  64// matchTree.matches.
  65type matchesState uint8
  66
  67const (
  68	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  69	// a search yet since the cost value is not high enough.
  70	matchesRequiresHigherCost matchesState = iota
  71
  72	// matchesFound is returned when matchTree.matches has done a search and
  73	// found one or more matches.
  74	matchesFound
  75
  76	// matchesNone is returned when matchTree.matches has done a search and
  77	// found nothing.
  78	matchesNone
  79)
  80
  81// matchesStatePred is a helper which returns matchesFound if b is true
  82// otherwise returns matchesNone.
  83func matchesStatePred(b bool) matchesState {
  84	if b {
  85		return matchesFound
  86	}
  87	return matchesNone
  88}
  89
  90// matchesStateForSlice is a helper which returns matchesFound if v is
  91// non-empty otherwise returns matchesNone.
  92func matchesStateForSlice[T any](v []T) matchesState {
  93	return matchesStatePred(len(v) > 0)
  94}
  95
  96// An expression tree coupled with matches. The matchtree has two
  97// functions:
  98//
  99// * it implements boolean combinations (and, or, not)
 100//
 101// * it implements shortcuts, where we skip documents (for example: if
 102// there are no trigram matches, we can be sure there are no substring
 103// matches). The matchtree iterates over the documents as they are
 104// ordered in the shard.
 105//
 106// The general process for a given (shard, query) is:
 107//
 108// - construct matchTree for the query
 109//
 110// - find all different leaf matchTrees (substring, regexp, etc.)
 111//
 112// in a loop:
 113//
 114//   - find next doc to process using nextDoc
 115//
 116//   - evaluate atoms (leaf expressions that match text)
 117//
 118//   - evaluate the tree using matches(), storing the result in map.
 119//
 120//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 121//     for the document, collect all text matches by looking at leaf
 122//     matchTrees.
 123type matchTree interface {
 124	docIterator
 125
 126	// matches if cost is high enough. See documentation for matchesState's
 127	// values.
 128	//
 129	// Note: Do not call this directly, rather use evalMatchTree which uses
 130	// known to cache responses once the state transitions away from
 131	// matchesRequiresHigherCost.
 132	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 133}
 134
 135// docMatchTree iterates over documents for which predicate(docID) returns true.
 136type docMatchTree struct {
 137	// the number of documents in a shard.
 138	numDocs uint32
 139
 140	predicate func(docID uint32) bool
 141
 142	// provides additional information about the reason why the docMatchTree was
 143	// created.
 144	reason string
 145
 146	// mutable
 147	firstDone bool
 148	docID     uint32
 149}
 150
 151type bruteForceMatchTree struct {
 152	// mutable
 153	firstDone bool
 154	docID     uint32
 155}
 156
 157type andLineMatchTree struct {
 158	andMatchTree
 159}
 160
 161type andMatchTree struct {
 162	children []matchTree
 163}
 164
 165type orMatchTree struct {
 166	children []matchTree
 167}
 168
 169type notMatchTree struct {
 170	child matchTree
 171}
 172
 173// Returns only the filename of child matches.
 174type fileNameMatchTree struct {
 175	child matchTree
 176}
 177
 178type boostMatchTree struct {
 179	child matchTree
 180	boost float64
 181}
 182
 183// Don't visit this subtree for collecting matches.
 184type noVisitMatchTree struct {
 185	matchTree
 186}
 187
 188type regexpMatchTree struct {
 189	regexp *regexp.Regexp
 190
 191	// hybridRegexp is a size-aware wrapper that dispatches to go-re2 for
 192	// large file content when ZOEKT_RE2_THRESHOLD_BYTES is configured.
 193	// For small inputs and filename matches, regexp is used directly.
 194	hybridRegexp *hybridre2.Regexp
 195
 196	// origRegexp is the original parsed regexp from the query structure. It
 197	// does not include mutations such as case sensitivity.
 198	origRegexp *syntax.Regexp
 199
 200	fileName bool
 201
 202	// mutable
 203	reEvaluated bool
 204	found       []*candidateMatch
 205
 206	// nextDoc, prepare.
 207	bruteForceMatchTree
 208}
 209
 210func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 211	prefix := ""
 212	if !s.CaseSensitive {
 213		prefix = "(?i)"
 214	}
 215
 216	pattern := prefix + syntaxutil.RegexpString(s.Regexp)
 217
 218	// hybridRegexp is only used for file content matching; skip the RE2
 219	// compilation overhead for filename-only regexps.
 220	var hr *hybridre2.Regexp
 221	if !s.FileName {
 222		hr = hybridre2.MustCompile(pattern)
 223	}
 224	return &regexpMatchTree{
 225		regexp:       regexp.MustCompile(pattern),
 226		hybridRegexp: hr,
 227		origRegexp:   s.Regexp,
 228		fileName:     s.FileName,
 229	}
 230}
 231
 232// \bLITERAL\b
 233type wordMatchTree struct {
 234	word string
 235
 236	fileName bool
 237
 238	// mutable
 239	evaluated bool
 240	found     []*candidateMatch
 241
 242	// nextDoc, prepare.
 243	bruteForceMatchTree
 244}
 245
 246type substrMatchTree struct {
 247	matchIterator
 248
 249	query         *query.Substring
 250	caseSensitive bool
 251	fileName      bool
 252
 253	// mutable
 254	current       []*candidateMatch
 255	contEvaluated bool
 256}
 257
 258type branchQueryMatchTree struct {
 259	fileMasks []uint64
 260	masks     []uint64
 261	repos     []uint16
 262
 263	// mutable
 264	firstDone bool
 265	docID     uint32
 266}
 267
 268func (t *branchQueryMatchTree) branchMask() uint64 {
 269	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 270}
 271
 272type symbolRegexpMatchTree struct {
 273	matchTree
 274	regexp *regexp.Regexp
 275	all    bool // skips regex match if .*
 276
 277	reEvaluated bool
 278	found       []*candidateMatch
 279}
 280
 281func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 282	t.reEvaluated = false
 283	t.found = t.found[:0]
 284	t.matchTree.prepare(doc)
 285}
 286
 287func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 288	if t.reEvaluated {
 289		return matchesStateForSlice(t.found)
 290	}
 291
 292	if cost < costRegexp {
 293		return matchesRequiresHigherCost
 294	}
 295
 296	sections := cp.docSections()
 297	content := cp.data(false)
 298
 299	found := t.found[:0]
 300	for i, sec := range sections {
 301		var idx []int
 302		if t.all {
 303			idx = []int{0, int(sec.End - sec.Start)}
 304		} else {
 305			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 306			if idx == nil {
 307				continue
 308			}
 309		}
 310
 311		cm := &candidateMatch{
 312			byteOffset:  sec.Start + uint32(idx[0]),
 313			byteMatchSz: uint32(idx[1] - idx[0]),
 314			symbol:      true,
 315			symbolIdx:   uint32(i),
 316		}
 317		found = append(found, cm)
 318	}
 319	t.found = found
 320	t.reEvaluated = true
 321
 322	return matchesStateForSlice(t.found)
 323}
 324
 325type symbolSubstrMatchTree struct {
 326	*substrMatchTree
 327
 328	patternSize   uint32
 329	fileEndRunes  []uint32
 330	fileEndSymbol []uint32
 331
 332	doc      uint32
 333	sections []DocumentSection
 334
 335	secID uint32
 336}
 337
 338func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 339	t.substrMatchTree.prepare(doc)
 340	t.doc = doc
 341
 342	var fileStart uint32
 343	if doc > 0 {
 344		fileStart = t.fileEndRunes[doc-1]
 345	}
 346
 347	var sections []DocumentSection
 348	if len(t.sections) > 0 {
 349		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 350		if most == uint32(len(t.sections)) {
 351			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 352		} else {
 353			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 354				t.secID++
 355			}
 356
 357			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 358			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 359				symbolEnd++
 360			}
 361
 362			sections = t.sections[t.secID:symbolEnd]
 363		}
 364	}
 365
 366	secIdx := 0
 367	trimmed := t.current[:0]
 368	for len(sections) > secIdx && len(t.current) > 0 {
 369		start := fileStart + t.current[0].runeOffset
 370		end := start + t.patternSize
 371		if start >= sections[secIdx].End {
 372			secIdx++
 373			continue
 374		}
 375
 376		if start < sections[secIdx].Start {
 377			t.current = t.current[1:]
 378			continue
 379		}
 380
 381		if end <= sections[secIdx].End {
 382			t.current[0].symbol = true
 383			t.current[0].symbolIdx = uint32(secIdx)
 384			trimmed = append(trimmed, t.current[0])
 385		}
 386
 387		t.current = t.current[1:]
 388	}
 389	t.current = trimmed
 390}
 391
 392// all prepare methods
 393
 394func (t *bruteForceMatchTree) prepare(doc uint32) {
 395	t.docID = doc
 396	t.firstDone = true
 397}
 398
 399func (t *docMatchTree) prepare(doc uint32) {
 400	t.docID = doc
 401	t.firstDone = true
 402}
 403
 404func (t *andMatchTree) prepare(doc uint32) {
 405	for _, c := range t.children {
 406		c.prepare(doc)
 407	}
 408}
 409
 410func (t *regexpMatchTree) prepare(doc uint32) {
 411	t.found = t.found[:0]
 412	t.reEvaluated = false
 413	t.bruteForceMatchTree.prepare(doc)
 414}
 415
 416func (t *wordMatchTree) prepare(doc uint32) {
 417	t.found = t.found[:0]
 418	t.evaluated = false
 419	t.bruteForceMatchTree.prepare(doc)
 420}
 421
 422func (t *orMatchTree) prepare(doc uint32) {
 423	for _, c := range t.children {
 424		c.prepare(doc)
 425	}
 426}
 427
 428func (t *notMatchTree) prepare(doc uint32) {
 429	t.child.prepare(doc)
 430}
 431
 432func (t *fileNameMatchTree) prepare(doc uint32) {
 433	t.child.prepare(doc)
 434}
 435
 436func (t *boostMatchTree) prepare(doc uint32) {
 437	t.child.prepare(doc)
 438}
 439
 440func (t *substrMatchTree) prepare(nextDoc uint32) {
 441	t.matchIterator.prepare(nextDoc)
 442	t.current = t.matchIterator.candidates()
 443	t.contEvaluated = false
 444}
 445
 446func (t *branchQueryMatchTree) prepare(doc uint32) {
 447	t.firstDone = true
 448	t.docID = doc
 449}
 450
 451// nextDoc
 452
 453func (t *docMatchTree) nextDoc() uint32 {
 454	var start uint32
 455	if t.firstDone {
 456		start = t.docID + 1
 457	}
 458	for i := start; i < t.numDocs; i++ {
 459		if t.predicate(i) {
 460			return i
 461		}
 462	}
 463	return math.MaxUint32
 464}
 465
 466func (t *bruteForceMatchTree) nextDoc() uint32 {
 467	if !t.firstDone {
 468		return 0
 469	}
 470	return t.docID + 1
 471}
 472
 473func (t *andMatchTree) nextDoc() uint32 {
 474	var max uint32
 475	for _, c := range t.children {
 476		m := c.nextDoc()
 477		if m > max {
 478			max = m
 479		}
 480	}
 481	return max
 482}
 483
 484func (t *orMatchTree) nextDoc() uint32 {
 485	min := uint32(math.MaxUint32)
 486	for _, c := range t.children {
 487		m := c.nextDoc()
 488		if m < min {
 489			min = m
 490		}
 491	}
 492	return min
 493}
 494
 495func (t *notMatchTree) nextDoc() uint32 {
 496	return 0
 497}
 498
 499func (t *fileNameMatchTree) nextDoc() uint32 {
 500	return t.child.nextDoc()
 501}
 502
 503func (t *boostMatchTree) nextDoc() uint32 {
 504	return t.child.nextDoc()
 505}
 506
 507func (t *branchQueryMatchTree) nextDoc() uint32 {
 508	var start uint32
 509	if t.firstDone {
 510		start = t.docID + 1
 511	}
 512
 513	for i := start; i < uint32(len(t.fileMasks)); i++ {
 514		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 515			return i
 516		}
 517	}
 518	return math.MaxUint32
 519}
 520
 521// all String methods
 522
 523func (t *bruteForceMatchTree) String() string {
 524	return "all"
 525}
 526
 527func (t *docMatchTree) String() string {
 528	return fmt.Sprintf("doc(%s)", t.reason)
 529}
 530
 531func (t *andMatchTree) String() string {
 532	return fmt.Sprintf("and%v", t.children)
 533}
 534
 535func (t *regexpMatchTree) String() string {
 536	f := ""
 537	if t.fileName {
 538		f = "f"
 539	}
 540	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 541}
 542
 543func (t *wordMatchTree) String() string {
 544	f := ""
 545	if t.fileName {
 546		f = "f"
 547	}
 548	return fmt.Sprintf("%sword(%s)", f, t.word)
 549}
 550
 551func (t *orMatchTree) String() string {
 552	return fmt.Sprintf("or%v", t.children)
 553}
 554
 555func (t *notMatchTree) String() string {
 556	return fmt.Sprintf("not(%v)", t.child)
 557}
 558
 559func (t *noVisitMatchTree) String() string {
 560	return fmt.Sprintf("novisit(%v)", t.matchTree)
 561}
 562
 563func (t *fileNameMatchTree) String() string {
 564	return fmt.Sprintf("f(%v)", t.child)
 565}
 566
 567func (t *boostMatchTree) String() string {
 568	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 569}
 570
 571func (t *substrMatchTree) String() string {
 572	f := ""
 573	if t.fileName {
 574		f = "f"
 575	}
 576
 577	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 578}
 579
 580func (t *branchQueryMatchTree) String() string {
 581	return fmt.Sprintf("branch(%x)", t.masks)
 582}
 583
 584func (t *symbolSubstrMatchTree) String() string {
 585	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 586}
 587
 588func (t *symbolRegexpMatchTree) String() string {
 589	return fmt.Sprintf("symbol(%v)", t.matchTree)
 590}
 591
 592// visitMatches visits all atoms in matchTree. Note: This visits
 593// noVisitMatchTree. For collecting matches use visitMatches.
 594func visitMatchTree(t matchTree, f func(matchTree)) {
 595	switch s := t.(type) {
 596	case *andMatchTree:
 597		for _, ch := range s.children {
 598			visitMatchTree(ch, f)
 599		}
 600	case *orMatchTree:
 601		for _, ch := range s.children {
 602			visitMatchTree(ch, f)
 603		}
 604	case *andLineMatchTree:
 605		visitMatchTree(&s.andMatchTree, f)
 606	case *noVisitMatchTree:
 607		visitMatchTree(s.matchTree, f)
 608	case *notMatchTree:
 609		visitMatchTree(s.child, f)
 610	case *fileNameMatchTree:
 611		visitMatchTree(s.child, f)
 612	case *boostMatchTree:
 613		visitMatchTree(s.child, f)
 614	case *symbolSubstrMatchTree:
 615		visitMatchTree(s.substrMatchTree, f)
 616	case *symbolRegexpMatchTree:
 617		visitMatchTree(s.matchTree, f)
 618	default:
 619		f(t)
 620	}
 621}
 622
 623// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 624// function defined.
 625func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
 626	visitMatchTree(mt, func(mt matchTree) {
 627		if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
 628			atom.updateStats(stats)
 629		}
 630	})
 631}
 632
 633func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 634	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 635		f(mt)
 636	})
 637}
 638
 639// visitMatches visits all atoms which can contribute matches. Note: This
 640// skips noVisitMatchTree.
 641func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 642	switch s := t.(type) {
 643	case *andMatchTree:
 644		for _, ch := range s.children {
 645			if known[ch] {
 646				visitMatches(ch, known, weight, f)
 647			}
 648		}
 649	case *andLineMatchTree:
 650		visitMatches(&s.andMatchTree, known, weight, f)
 651	case *orMatchTree:
 652		for _, ch := range s.children {
 653			if known[ch] {
 654				visitMatches(ch, known, weight, f)
 655			}
 656		}
 657	case *boostMatchTree:
 658		visitMatches(s.child, known, weight*s.boost, f)
 659	case *symbolSubstrMatchTree:
 660		visitMatches(s.substrMatchTree, known, weight, f)
 661	case *notMatchTree:
 662	case *noVisitMatchTree:
 663		// don't collect into negative trees.
 664	case *fileNameMatchTree:
 665		// We will just gather the filename if we do not visit this tree.
 666	default:
 667		f(s, weight)
 668	}
 669}
 670
 671// all matches() methods.
 672
 673func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 674	return matchesStatePred(t.predicate(cp.idx))
 675}
 676
 677func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 678	return matchesFound
 679}
 680
 681// andLineMatchTree is a performance optimization of andMatchTree. For content
 682// searches we don't want to run the regex engine if there is no line that
 683// contains matches from all terms.
 684func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 685	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 686		return state
 687	}
 688
 689	// Invariant: all children have matches. If any line contains all of them we
 690	// can return MatchesFound.
 691
 692	// find child with fewest candidates
 693	minCount := math.MaxInt
 694	fewestChildren := 0
 695	for ix, child := range t.children {
 696		v, ok := child.(*substrMatchTree)
 697		// make sure we are running a content search and that all candidates are a
 698		// substrMatchTree
 699		if !ok || v.fileName {
 700			return matchesFound
 701		}
 702		if len(v.current) < minCount {
 703			minCount = len(v.current)
 704			fewestChildren = ix
 705		}
 706	}
 707
 708	type lineRange struct {
 709		start int
 710		end   int
 711	}
 712	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 713	prev := -1
 714	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 715		line := cp.newlines().atOffset(candidate.byteOffset)
 716		if line == prev {
 717			continue
 718		}
 719		prev = line
 720		byteStart := int(cp.newlines().lineStart(line))
 721		byteEnd := int(cp.newlines().lineStart(line + 1))
 722		lines = append(lines, lineRange{byteStart, byteEnd})
 723	}
 724
 725	// children keeps track of the children's candidates we have already seen.
 726	children := make([][]*candidateMatch, 0, len(t.children)-1)
 727	for j, child := range t.children {
 728		if j == fewestChildren {
 729			continue
 730		}
 731		children = append(children, child.(*substrMatchTree).current)
 732	}
 733
 734nextLine:
 735	for i := 0; i < len(lines); i++ {
 736		hits := 1
 737	nextChild:
 738		for j := range children {
 739		nextCandidate:
 740			for len(children[j]) > 0 {
 741				candidate := children[j][0]
 742				bo := int(cp.findOffset(false, candidate.runeOffset))
 743				if bo < lines[i].start {
 744					children[j] = children[j][1:]
 745					continue nextCandidate
 746				}
 747				if bo < lines[i].end {
 748					hits++
 749					continue nextChild
 750				}
 751				// move the `lines` iterator forward until bo < line.end
 752				for i < len(lines) && bo >= lines[i].end {
 753					i++
 754				}
 755				i--
 756				continue nextLine
 757			}
 758		}
 759		// return early once we found any line that contains matches from all children
 760		if hits == len(t.children) {
 761			return matchesFound
 762		}
 763	}
 764	return matchesNone
 765}
 766
 767func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 768	// We have found matches unless a child needs to do more work or it hasn't
 769	// found matches.
 770	state := matchesFound
 771
 772	for _, ch := range t.children {
 773		switch evalMatchTree(cp, cost, known, ch) {
 774		case matchesRequiresHigherCost:
 775			// keep evaluating other children incase we come across matchesNone
 776			state = matchesRequiresHigherCost
 777		case matchesFound:
 778			// will return this if every child has this value
 779		case matchesNone:
 780			return matchesNone
 781		}
 782	}
 783
 784	return state
 785}
 786
 787func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 788	// we could short-circuit, but we want to use the other possibilities as a
 789	// ranking signal. So we always return the most conservative state.
 790	state := matchesNone
 791	for _, ch := range t.children {
 792		switch evalMatchTree(cp, cost, known, ch) {
 793		case matchesRequiresHigherCost:
 794			state = matchesRequiresHigherCost
 795		case matchesFound:
 796			if state != matchesRequiresHigherCost {
 797				state = matchesFound
 798			}
 799		case matchesNone:
 800			// noop
 801		}
 802	}
 803	return state
 804}
 805
 806func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 807	return matchesStatePred(t.branchMask() != 0)
 808}
 809
 810func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 811	if t.reEvaluated {
 812		return matchesStateForSlice(t.found)
 813	}
 814
 815	if cost < costRegexp {
 816		return matchesRequiresHigherCost
 817	}
 818
 819	cp.stats.RegexpsConsidered++
 820	data := cp.data(t.fileName)
 821	// For file content, use hybridRegexp which dispatches to go-re2 when
 822	// len(data) >= ZOEKT_RE2_THRESHOLD_BYTES. For filename matching, use
 823	// grafana/regexp directly: filenames are always short, so the WASM
 824	// call overhead of go-re2 outweighs any benefit.
 825	var idxs [][]int
 826	if t.fileName {
 827		idxs = t.regexp.FindAllIndex(data, -1)
 828	} else {
 829		idxs = t.hybridRegexp.FindAllIndex(data, -1)
 830	}
 831	found := t.found[:0]
 832	for _, idx := range idxs {
 833		cm := &candidateMatch{
 834			byteOffset:  uint32(idx[0]),
 835			byteMatchSz: uint32(idx[1] - idx[0]),
 836			fileName:    t.fileName,
 837		}
 838
 839		found = append(found, cm)
 840	}
 841	t.found = found
 842	t.reEvaluated = true
 843
 844	return matchesStateForSlice(t.found)
 845}
 846
 847func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 848	if t.evaluated {
 849		return matchesStateForSlice(t.found)
 850	}
 851
 852	if cost < costRegexp {
 853		return matchesRequiresHigherCost
 854	}
 855
 856	data := cp.data(t.fileName)
 857	offset := 0
 858	found := t.found[:0]
 859	for {
 860		idx := bytes.Index(data[offset:], []byte(t.word))
 861		if idx < 0 {
 862			break
 863		}
 864
 865		relStartOffset := offset + idx
 866		relEndOffset := relStartOffset + len(t.word)
 867
 868		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 869		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 870		if startBoundary && endBoundary {
 871			found = append(found, &candidateMatch{
 872				byteOffset:  uint32(offset + idx),
 873				byteMatchSz: uint32(len(t.word)),
 874				fileName:    t.fileName,
 875			})
 876		}
 877		offset += idx + len(t.word)
 878	}
 879
 880	t.found = found
 881	t.evaluated = true
 882
 883	return matchesStateForSlice(t.found)
 884}
 885
 886// breakMatchesOnNewlines returns matches resulting from breaking each element
 887// of cms on newlines within text.
 888func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 889	var lineCMs []*candidateMatch
 890	for _, cm := range cms {
 891		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 892	}
 893	return lineCMs
 894}
 895
 896// breakOnNewlines returns matches resulting from breaking cm on newlines
 897// within text.
 898func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 899	var cms []*candidateMatch
 900	addMe := &candidateMatch{}
 901	*addMe = *cm
 902	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 903		if text[i] == '\n' {
 904			addMe.byteMatchSz = i - addMe.byteOffset
 905			if addMe.byteMatchSz != 0 {
 906				cms = append(cms, addMe)
 907			}
 908
 909			addMe = &candidateMatch{}
 910			*addMe = *cm
 911			addMe.byteOffset = i + 1
 912		}
 913	}
 914	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 915	if addMe.byteMatchSz != 0 {
 916		cms = append(cms, addMe)
 917	}
 918	return cms
 919}
 920
 921// evalMatchTree should be called instead of directly calling
 922// matchTree.matches. It cache known values for future evaluation at higher
 923// costs.
 924func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 925	if v, ok := known[mt]; ok {
 926		return matchesStatePred(v)
 927	}
 928
 929	ms := mt.matches(cp, cost, known)
 930	if ms != matchesRequiresHigherCost {
 931		known[mt] = ms == matchesFound
 932	}
 933
 934	return ms
 935}
 936
 937func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 938	switch evalMatchTree(cp, cost, known, t.child) {
 939	case matchesRequiresHigherCost:
 940		return matchesRequiresHigherCost
 941	case matchesFound:
 942		return matchesNone
 943	case matchesNone:
 944		return matchesFound
 945	default:
 946		panic("unreachable")
 947	}
 948}
 949
 950func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 951	return evalMatchTree(cp, cost, known, t.child)
 952}
 953
 954func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 955	return evalMatchTree(cp, cost, known, t.child)
 956}
 957
 958func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 959	if t.contEvaluated {
 960		return matchesStateForSlice(t.current)
 961	}
 962
 963	if len(t.current) == 0 {
 964		return matchesNone
 965	}
 966
 967	if t.fileName && cost < costMemory {
 968		return matchesRequiresHigherCost
 969	}
 970
 971	if !t.fileName && cost < costContent {
 972		return matchesRequiresHigherCost
 973	}
 974
 975	pruned := t.current[:0]
 976	for _, m := range t.current {
 977		if m.byteOffset == 0 && m.runeOffset > 0 {
 978			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 979		}
 980		if m.matchContent(cp.data(m.fileName)) {
 981			pruned = append(pruned, m)
 982		}
 983	}
 984	t.current = pruned
 985	t.contEvaluated = true
 986
 987	return matchesStateForSlice(t.current)
 988}
 989
 990type matchTreeOpt struct {
 991	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 992	// This was added since we do not support wordMatchTree with symbol search.
 993	DisableWordMatchOptimization bool
 994}
 995
 996func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 997	if q == nil {
 998		return nil, fmt.Errorf("got nil (sub)query")
 999	}
1000
1001	switch s := q.(type) {
1002	case *query.Regexp:
1003		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
1004		// superset of the regexp. If the returned matchTree is equivalent to the
1005		// original regexp, it returns true. An equivalent matchTree has the same
1006		// behaviour as the original regexp and can be used instead.
1007		//
1008		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
1009		if err != nil {
1010			return nil, err
1011		}
1012		// if the query can be used in place of the regexp
1013		// return the subtree
1014		if isEq {
1015			return subMT, nil
1016		}
1017
1018		var tr matchTree
1019		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
1020			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
1021			// provide something faster.
1022			tr = wmt
1023		} else {
1024			tr = newRegexpMatchTree(s)
1025		}
1026
1027		return &andMatchTree{
1028			children: []matchTree{
1029				tr, &noVisitMatchTree{subMT},
1030			},
1031		}, nil
1032	case *query.And:
1033		var r []matchTree
1034		for _, ch := range s.Children {
1035			ct, err := d.newMatchTree(ch, opt)
1036			if err != nil {
1037				return nil, err
1038			}
1039			r = append(r, ct)
1040		}
1041		return &andMatchTree{r}, nil
1042	case *query.Or:
1043		var r []matchTree
1044		for _, ch := range s.Children {
1045			ct, err := d.newMatchTree(ch, opt)
1046			if err != nil {
1047				return nil, err
1048			}
1049			r = append(r, ct)
1050		}
1051		return &orMatchTree{r}, nil
1052	case *query.Not:
1053		ct, err := d.newMatchTree(s.Child, opt)
1054		return &notMatchTree{
1055			child: ct,
1056		}, err
1057
1058	case *query.Type:
1059		if s.Type != query.TypeFileName {
1060			break
1061		}
1062
1063		ct, err := d.newMatchTree(s.Child, opt)
1064		if err != nil {
1065			return nil, err
1066		}
1067
1068		return &fileNameMatchTree{
1069			child: ct,
1070		}, nil
1071
1072	case *query.Boost:
1073		ct, err := d.newMatchTree(s.Child, opt)
1074		if err != nil {
1075			return nil, err
1076		}
1077
1078		return &boostMatchTree{
1079			child: ct,
1080			boost: s.Boost,
1081		}, nil
1082
1083	case *query.Meta:
1084		checksum := queryMetaChecksum(s.Field, s.Value)
1085		cacheKeyField := "Meta"
1086		if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok {
1087			return cached, nil
1088		}
1089
1090		reposWant := make([]bool, len(d.repoMetaData))
1091		for repoIdx, r := range d.repoMetaData {
1092			if r.Metadata != nil {
1093				if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) {
1094					reposWant[repoIdx] = true
1095				}
1096			}
1097		}
1098
1099		mt := &docMatchTree{
1100			reason:  "Meta",
1101			numDocs: d.numDocs(),
1102			predicate: func(docID uint32) bool {
1103				repoIdx := d.repos[docID]
1104				if int(repoIdx) >= len(reposWant) {
1105					return false
1106				}
1107				return reposWant[repoIdx]
1108			},
1109		}
1110		d.docMatchTreeCache.Add(cacheKeyField, checksum, mt)
1111		return mt, nil
1112
1113	case *query.Substring:
1114		return d.newSubstringMatchTree(s)
1115
1116	case *query.Branch:
1117		masks := make([]uint64, 0, len(d.repoMetaData))
1118		if s.Pattern == "HEAD" {
1119			for range d.repoMetaData {
1120				masks = append(masks, 1)
1121			}
1122		} else {
1123			for _, branchIDs := range d.branchIDs {
1124				mask := uint64(0)
1125				for nm, m := range branchIDs {
1126					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1127						mask |= uint64(m)
1128					}
1129				}
1130				masks = append(masks, mask)
1131			}
1132		}
1133		return &branchQueryMatchTree{
1134			masks:     masks,
1135			fileMasks: d.fileBranchMasks,
1136			repos:     d.repos,
1137		}, nil
1138	case *query.Const:
1139		if s.Value {
1140			return &bruteForceMatchTree{}, nil
1141		} else {
1142			return &noMatchTree{Why: "const"}, nil
1143		}
1144	case *query.Language:
1145		code, ok := d.metaData.LanguageMap[s.Language]
1146		if !ok {
1147			return &noMatchTree{Why: "lang"}, nil
1148		}
1149		return &docMatchTree{
1150			reason:  "language",
1151			numDocs: d.numDocs(),
1152			predicate: func(docID uint32) bool {
1153				return d.getLanguage(docID) == code
1154			},
1155		}, nil
1156
1157	case *query.Symbol:
1158		// Disable WordMatchTree since we don't support it in symbols yet.
1159		optCopy := opt
1160		optCopy.DisableWordMatchOptimization = true
1161
1162		subMT, err := d.newMatchTree(s.Expr, optCopy)
1163		if err != nil {
1164			return nil, err
1165		}
1166
1167		if substr, ok := subMT.(*substrMatchTree); ok {
1168			return &symbolSubstrMatchTree{
1169				substrMatchTree: substr,
1170				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1171				fileEndRunes:    d.fileEndRunes,
1172				fileEndSymbol:   d.fileEndSymbol,
1173				sections:        d.runeDocSections,
1174			}, nil
1175		}
1176
1177		var regexpMT *regexpMatchTree
1178		visitMatchTree(subMT, func(mt matchTree) {
1179			if t, ok := mt.(*regexpMatchTree); ok {
1180				regexpMT = t
1181			}
1182		})
1183		if regexpMT == nil {
1184			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1185		}
1186
1187		return &symbolRegexpMatchTree{
1188			regexp:    regexpMT.regexp,
1189			all:       isRegexpAll(regexpMT.origRegexp),
1190			matchTree: subMT,
1191		}, nil
1192
1193	case *query.FileNameSet:
1194		return &docMatchTree{
1195			reason:  "FileNameSet",
1196			numDocs: d.numDocs(),
1197			predicate: func(docID uint32) bool {
1198				fileName := d.fileName(docID)
1199				_, ok := s.Set[string(fileName)]
1200				return ok
1201			},
1202		}, nil
1203
1204	case *query.BranchesRepos:
1205		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1206		for repoIdx := range d.repoMetaData {
1207			var mask uint64
1208			for _, br := range s.List {
1209				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1210					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1211				}
1212			}
1213			reposBranchesWant[repoIdx] = mask
1214		}
1215		return &docMatchTree{
1216			reason:  "BranchesRepos",
1217			numDocs: d.numDocs(),
1218			predicate: func(docID uint32) bool {
1219				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1220			},
1221		}, nil
1222
1223	case *query.RepoSet:
1224		reposWant := make([]bool, len(d.repoMetaData))
1225		for repoIdx, r := range d.repoMetaData {
1226			if _, ok := s.Set[r.Name]; ok {
1227				reposWant[repoIdx] = true
1228			}
1229		}
1230		return &docMatchTree{
1231			reason:  "RepoSet",
1232			numDocs: d.numDocs(),
1233			predicate: func(docID uint32) bool {
1234				return reposWant[d.repos[docID]]
1235			},
1236		}, nil
1237
1238	case *query.RepoIDs:
1239		reposWant := make([]bool, len(d.repoMetaData))
1240		for repoIdx, r := range d.repoMetaData {
1241			if s.Repos.Contains(r.ID) {
1242				reposWant[repoIdx] = true
1243			}
1244		}
1245		return &docMatchTree{
1246			reason:  "RepoIDs",
1247			numDocs: d.numDocs(),
1248			predicate: func(docID uint32) bool {
1249				return reposWant[d.repos[docID]]
1250			},
1251		}, nil
1252
1253	case *query.Repo:
1254		reposWant := make([]bool, len(d.repoMetaData))
1255		for repoIdx, r := range d.repoMetaData {
1256			if s.Regexp.MatchString(r.Name) {
1257				reposWant[repoIdx] = true
1258			}
1259		}
1260		return &docMatchTree{
1261			reason:  "Repo",
1262			numDocs: d.numDocs(),
1263			predicate: func(docID uint32) bool {
1264				return reposWant[d.repos[docID]]
1265			},
1266		}, nil
1267
1268	case *query.RepoRegexp:
1269		reposWant := make([]bool, len(d.repoMetaData))
1270		for repoIdx, r := range d.repoMetaData {
1271			if s.Regexp.MatchString(r.Name) {
1272				reposWant[repoIdx] = true
1273			}
1274		}
1275		return &docMatchTree{
1276			reason:  "RepoRegexp",
1277			numDocs: d.numDocs(),
1278			predicate: func(docID uint32) bool {
1279				return reposWant[d.repos[docID]]
1280			},
1281		}, nil
1282
1283	case query.RawConfig:
1284		return &docMatchTree{
1285			reason:  s.String(),
1286			numDocs: d.numDocs(),
1287			predicate: func(docID uint32) bool {
1288				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1289			},
1290		}, nil
1291	}
1292	log.Panicf("type %T", q)
1293	return nil, nil
1294}
1295
1296func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1297	st := &substrMatchTree{
1298		query:         s,
1299		caseSensitive: s.CaseSensitive,
1300		fileName:      s.FileName,
1301	}
1302
1303	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1304		return newRegexpMatchTree(&query.Regexp{
1305			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1306			FileName:      s.FileName,
1307			Content:       s.Content,
1308			CaseSensitive: s.CaseSensitive,
1309		}), nil
1310	}
1311
1312	result, err := d.iterateNgrams(s)
1313	if err != nil {
1314		return nil, err
1315	}
1316	st.matchIterator = result
1317	return st, nil
1318}
1319
1320func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1321	if opt.DisableWordMatchOptimization {
1322		return nil, false
1323	}
1324	// Needs to be case sensitive
1325	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1326		return nil, false
1327	}
1328	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1329	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1330		return nil, false
1331	}
1332	sub := q.Regexp.Sub
1333	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1334		return nil, false
1335	}
1336
1337	return &wordMatchTree{
1338		word:     string(sub[1].Rune),
1339		fileName: q.FileName,
1340	}, true
1341}
1342
1343// pruneMatchTree removes impossible branches from the matchTree, as indicated
1344// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1345func pruneMatchTree(mt matchTree) (matchTree, error) {
1346	var err error
1347	switch mt := mt.(type) {
1348	// leaf nodes that we test with the filter:
1349	case *substrMatchTree:
1350		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1351			if _, ok := res.matchIterator.(*noMatchTree); ok {
1352				return nil, nil
1353			}
1354		}
1355	// recursive tree structures:
1356	case *andMatchTree:
1357		// Any branch of an and becoming impossible means the entire clause
1358		// is impossible. Otherwise, just handle rewrites.
1359		for i, child := range mt.children {
1360			newChild, err := pruneMatchTree(child)
1361			if err != nil {
1362				return nil, err
1363			}
1364			if newChild == nil {
1365				return nil, nil
1366			}
1367			mt.children[i] = newChild
1368		}
1369	case *orMatchTree:
1370		// *All* branches of an OR becoming impossible means the entire clause
1371		// is impossible. Otherwise, drop impossible subclauses and handle
1372		// rewrites, including simplifying to a singular resulting child branch.
1373		n := 0
1374		for _, child := range mt.children {
1375			newChild, err := pruneMatchTree(child)
1376			if err != nil {
1377				return nil, err
1378			}
1379			if newChild != nil {
1380				mt.children[n] = newChild
1381				n++
1382			}
1383		}
1384		mt.children = mt.children[:n]
1385		if len(mt.children) == 1 {
1386			return mt.children[0], nil
1387		} else if len(mt.children) == 0 {
1388			return nil, nil
1389		}
1390	case *noVisitMatchTree:
1391		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1392		if err != nil {
1393			return nil, err
1394		}
1395		if mt.matchTree == nil {
1396			return nil, nil
1397		}
1398	case *fileNameMatchTree:
1399		mt.child, err = pruneMatchTree(mt.child)
1400		if err != nil {
1401			return nil, err
1402		}
1403		if mt.child == nil {
1404			return nil, nil
1405		}
1406	case *boostMatchTree:
1407		mt.child, err = pruneMatchTree(mt.child)
1408		if err != nil {
1409			return nil, err
1410		}
1411		if mt.child == nil {
1412			return nil, nil
1413		}
1414	case *andLineMatchTree:
1415		child, err := pruneMatchTree(&mt.andMatchTree)
1416		if err != nil {
1417			return nil, err
1418		}
1419		if child == nil {
1420			return nil, nil
1421		}
1422		if c, ok := child.(*andMatchTree); ok {
1423			mt.andMatchTree = *c
1424		} else {
1425			// the and was simplified to a single clause,
1426			// so the linematch portion is irrelevant.
1427			return mt, nil
1428		}
1429	case *notMatchTree:
1430		mt.child, err = pruneMatchTree(mt.child)
1431		if err != nil {
1432			return nil, err
1433		}
1434		if mt.child == nil {
1435			// not false => true
1436			return &bruteForceMatchTree{}, nil
1437		}
1438	// unhandled:
1439	case *docMatchTree:
1440	case *bruteForceMatchTree:
1441	case *regexpMatchTree:
1442	case *wordMatchTree:
1443	}
1444	return mt, err
1445}
1446
1447// isRegexpAll returns true if the query matches all possible lines.
1448//
1449// Note: it is possible for a funky regex to actually match all but this
1450// returns false. This returns true for normal looking regexes like ".*" or
1451// "(?-s:.*)".
1452func isRegexpAll(r *syntax.Regexp) bool {
1453	// Note: we don't care about any flags since we are looking for .* and we
1454	// don't mind if . matches all or everything but newline.
1455
1456	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1457	for {
1458		// Our main target: .*
1459		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1460			// (?s:.) or (?-s:.)
1461			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1462		}
1463
1464		// Strip away expressions being wrapped in paranthesis
1465		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1466			r = r.Sub[0]
1467			continue
1468		}
1469
1470		return false
1471	}
1472}
1473
1474func queryMetaChecksum(field string, value *regexp.Regexp) string {
1475	h := xxhash.New()
1476	h.Write([]byte(field))
1477	h.Write([]byte{':'})
1478	h.Write([]byte(value.String()))
1479	return fmt.Sprintf("%x", h.Sum64())
1480}
Configure Feed

Configure Feed