index/matchtree.go at 82531c7405372ff7eba501f766904290b6970e7d · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchtree.go
at 82531c7405372ff7eba501f766904290b6970e7d 35 kB View raw
John Mason Cache docMatchTree results for Meta conditions (#982) 9mo ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package index
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"math"
  22	"regexp/syntax"
  23	"strings"
  24	"unicode/utf8"
  25
  26	"github.com/cespare/xxhash/v2"
  27	"github.com/grafana/regexp"
  28
  29	"github.com/sourcegraph/zoekt"
  30	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  31	"github.com/sourcegraph/zoekt/query"
  32)
  33
  34// A docIterator iterates over documents in order.
  35type docIterator interface {
  36	// provide the next document where we may find something interesting.
  37	//
  38	// This is like a "peek" and shouldn't mutate state. prepare is what should
  39	// change state.
  40	nextDoc() uint32
  41
  42	// clears any per-document state of the docIterator, and
  43	// prepares for evaluating the given doc. The argument is
  44	// strictly increasing over time.
  45	prepare(nextDoc uint32)
  46}
  47
  48// costs are passed in increasing order to matchTree.matches until they do not
  49// return matchesRequiresHigherCost.
  50const (
  51	costConst   = 0
  52	costMemory  = 1
  53	costContent = 2
  54	costRegexp  = 3
  55)
  56
  57const (
  58	costMin = costConst
  59	costMax = costRegexp
  60)
  61
  62// matchesState is an enum for the state of a matchTree after a call to
  63// matchTree.matches.
  64type matchesState uint8
  65
  66const (
  67	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  68	// a search yet since the cost value is not high enough.
  69	matchesRequiresHigherCost matchesState = iota
  70
  71	// matchesFound is returned when matchTree.matches has done a search and
  72	// found one or more matches.
  73	matchesFound
  74
  75	// matchesNone is returned when matchTree.matches has done a search and
  76	// found nothing.
  77	matchesNone
  78)
  79
  80// matchesStatePred is a helper which returns matchesFound if b is true
  81// otherwise returns matchesNone.
  82func matchesStatePred(b bool) matchesState {
  83	if b {
  84		return matchesFound
  85	}
  86	return matchesNone
  87}
  88
  89// matchesStateForSlice is a helper which returns matchesFound if v is
  90// non-empty otherwise returns matchesNone.
  91func matchesStateForSlice[T any](v []T) matchesState {
  92	return matchesStatePred(len(v) > 0)
  93}
  94
  95// An expression tree coupled with matches. The matchtree has two
  96// functions:
  97//
  98// * it implements boolean combinations (and, or, not)
  99//
 100// * it implements shortcuts, where we skip documents (for example: if
 101// there are no trigram matches, we can be sure there are no substring
 102// matches). The matchtree iterates over the documents as they are
 103// ordered in the shard.
 104//
 105// The general process for a given (shard, query) is:
 106//
 107// - construct matchTree for the query
 108//
 109// - find all different leaf matchTrees (substring, regexp, etc.)
 110//
 111// in a loop:
 112//
 113//   - find next doc to process using nextDoc
 114//
 115//   - evaluate atoms (leaf expressions that match text)
 116//
 117//   - evaluate the tree using matches(), storing the result in map.
 118//
 119//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 120//     for the document, collect all text matches by looking at leaf
 121//     matchTrees.
 122type matchTree interface {
 123	docIterator
 124
 125	// matches if cost is high enough. See documentation for matchesState's
 126	// values.
 127	//
 128	// Note: Do not call this directly, rather use evalMatchTree which uses
 129	// known to cache responses once the state transitions away from
 130	// matchesRequiresHigherCost.
 131	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 132}
 133
 134// docMatchTree iterates over documents for which predicate(docID) returns true.
 135type docMatchTree struct {
 136	// the number of documents in a shard.
 137	numDocs uint32
 138
 139	predicate func(docID uint32) bool
 140
 141	// provides additional information about the reason why the docMatchTree was
 142	// created.
 143	reason string
 144
 145	// mutable
 146	firstDone bool
 147	docID     uint32
 148}
 149
 150type bruteForceMatchTree struct {
 151	// mutable
 152	firstDone bool
 153	docID     uint32
 154}
 155
 156type andLineMatchTree struct {
 157	andMatchTree
 158}
 159
 160type andMatchTree struct {
 161	children []matchTree
 162}
 163
 164type orMatchTree struct {
 165	children []matchTree
 166}
 167
 168type notMatchTree struct {
 169	child matchTree
 170}
 171
 172// Returns only the filename of child matches.
 173type fileNameMatchTree struct {
 174	child matchTree
 175}
 176
 177type boostMatchTree struct {
 178	child matchTree
 179	boost float64
 180}
 181
 182// Don't visit this subtree for collecting matches.
 183type noVisitMatchTree struct {
 184	matchTree
 185}
 186
 187type regexpMatchTree struct {
 188	regexp *regexp.Regexp
 189
 190	// origRegexp is the original parsed regexp from the query structure. It
 191	// does not include mutations such as case sensitivity.
 192	origRegexp *syntax.Regexp
 193
 194	fileName bool
 195
 196	// mutable
 197	reEvaluated bool
 198	found       []*candidateMatch
 199
 200	// nextDoc, prepare.
 201	bruteForceMatchTree
 202}
 203
 204func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 205	prefix := ""
 206	if !s.CaseSensitive {
 207		prefix = "(?i)"
 208	}
 209
 210	return &regexpMatchTree{
 211		regexp:     regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
 212		origRegexp: s.Regexp,
 213		fileName:   s.FileName,
 214	}
 215}
 216
 217// \bLITERAL\b
 218type wordMatchTree struct {
 219	word string
 220
 221	fileName bool
 222
 223	// mutable
 224	evaluated bool
 225	found     []*candidateMatch
 226
 227	// nextDoc, prepare.
 228	bruteForceMatchTree
 229}
 230
 231type substrMatchTree struct {
 232	matchIterator
 233
 234	query         *query.Substring
 235	caseSensitive bool
 236	fileName      bool
 237
 238	// mutable
 239	current       []*candidateMatch
 240	contEvaluated bool
 241}
 242
 243type branchQueryMatchTree struct {
 244	fileMasks []uint64
 245	masks     []uint64
 246	repos     []uint16
 247
 248	// mutable
 249	firstDone bool
 250	docID     uint32
 251}
 252
 253func (t *branchQueryMatchTree) branchMask() uint64 {
 254	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 255}
 256
 257type symbolRegexpMatchTree struct {
 258	matchTree
 259	regexp *regexp.Regexp
 260	all    bool // skips regex match if .*
 261
 262	reEvaluated bool
 263	found       []*candidateMatch
 264}
 265
 266func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 267	t.reEvaluated = false
 268	t.found = t.found[:0]
 269	t.matchTree.prepare(doc)
 270}
 271
 272func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 273	if t.reEvaluated {
 274		return matchesStateForSlice(t.found)
 275	}
 276
 277	if cost < costRegexp {
 278		return matchesRequiresHigherCost
 279	}
 280
 281	sections := cp.docSections()
 282	content := cp.data(false)
 283
 284	found := t.found[:0]
 285	for i, sec := range sections {
 286		var idx []int
 287		if t.all {
 288			idx = []int{0, int(sec.End - sec.Start)}
 289		} else {
 290			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 291			if idx == nil {
 292				continue
 293			}
 294		}
 295
 296		cm := &candidateMatch{
 297			byteOffset:  sec.Start + uint32(idx[0]),
 298			byteMatchSz: uint32(idx[1] - idx[0]),
 299			symbol:      true,
 300			symbolIdx:   uint32(i),
 301		}
 302		found = append(found, cm)
 303	}
 304	t.found = found
 305	t.reEvaluated = true
 306
 307	return matchesStateForSlice(t.found)
 308}
 309
 310type symbolSubstrMatchTree struct {
 311	*substrMatchTree
 312
 313	patternSize   uint32
 314	fileEndRunes  []uint32
 315	fileEndSymbol []uint32
 316
 317	doc      uint32
 318	sections []DocumentSection
 319
 320	secID uint32
 321}
 322
 323func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 324	t.substrMatchTree.prepare(doc)
 325	t.doc = doc
 326
 327	var fileStart uint32
 328	if doc > 0 {
 329		fileStart = t.fileEndRunes[doc-1]
 330	}
 331
 332	var sections []DocumentSection
 333	if len(t.sections) > 0 {
 334		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 335		if most == uint32(len(t.sections)) {
 336			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 337		} else {
 338			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 339				t.secID++
 340			}
 341
 342			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 343			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 344				symbolEnd++
 345			}
 346
 347			sections = t.sections[t.secID:symbolEnd]
 348		}
 349	}
 350
 351	secIdx := 0
 352	trimmed := t.current[:0]
 353	for len(sections) > secIdx && len(t.current) > 0 {
 354		start := fileStart + t.current[0].runeOffset
 355		end := start + t.patternSize
 356		if start >= sections[secIdx].End {
 357			secIdx++
 358			continue
 359		}
 360
 361		if start < sections[secIdx].Start {
 362			t.current = t.current[1:]
 363			continue
 364		}
 365
 366		if end <= sections[secIdx].End {
 367			t.current[0].symbol = true
 368			t.current[0].symbolIdx = uint32(secIdx)
 369			trimmed = append(trimmed, t.current[0])
 370		}
 371
 372		t.current = t.current[1:]
 373	}
 374	t.current = trimmed
 375}
 376
 377// all prepare methods
 378
 379func (t *bruteForceMatchTree) prepare(doc uint32) {
 380	t.docID = doc
 381	t.firstDone = true
 382}
 383
 384func (t *docMatchTree) prepare(doc uint32) {
 385	t.docID = doc
 386	t.firstDone = true
 387}
 388
 389func (t *andMatchTree) prepare(doc uint32) {
 390	for _, c := range t.children {
 391		c.prepare(doc)
 392	}
 393}
 394
 395func (t *regexpMatchTree) prepare(doc uint32) {
 396	t.found = t.found[:0]
 397	t.reEvaluated = false
 398	t.bruteForceMatchTree.prepare(doc)
 399}
 400
 401func (t *wordMatchTree) prepare(doc uint32) {
 402	t.found = t.found[:0]
 403	t.evaluated = false
 404	t.bruteForceMatchTree.prepare(doc)
 405}
 406
 407func (t *orMatchTree) prepare(doc uint32) {
 408	for _, c := range t.children {
 409		c.prepare(doc)
 410	}
 411}
 412
 413func (t *notMatchTree) prepare(doc uint32) {
 414	t.child.prepare(doc)
 415}
 416
 417func (t *fileNameMatchTree) prepare(doc uint32) {
 418	t.child.prepare(doc)
 419}
 420
 421func (t *boostMatchTree) prepare(doc uint32) {
 422	t.child.prepare(doc)
 423}
 424
 425func (t *substrMatchTree) prepare(nextDoc uint32) {
 426	t.matchIterator.prepare(nextDoc)
 427	t.current = t.matchIterator.candidates()
 428	t.contEvaluated = false
 429}
 430
 431func (t *branchQueryMatchTree) prepare(doc uint32) {
 432	t.firstDone = true
 433	t.docID = doc
 434}
 435
 436// nextDoc
 437
 438func (t *docMatchTree) nextDoc() uint32 {
 439	var start uint32
 440	if t.firstDone {
 441		start = t.docID + 1
 442	}
 443	for i := start; i < t.numDocs; i++ {
 444		if t.predicate(i) {
 445			return i
 446		}
 447	}
 448	return math.MaxUint32
 449}
 450
 451func (t *bruteForceMatchTree) nextDoc() uint32 {
 452	if !t.firstDone {
 453		return 0
 454	}
 455	return t.docID + 1
 456}
 457
 458func (t *andMatchTree) nextDoc() uint32 {
 459	var max uint32
 460	for _, c := range t.children {
 461		m := c.nextDoc()
 462		if m > max {
 463			max = m
 464		}
 465	}
 466	return max
 467}
 468
 469func (t *orMatchTree) nextDoc() uint32 {
 470	min := uint32(math.MaxUint32)
 471	for _, c := range t.children {
 472		m := c.nextDoc()
 473		if m < min {
 474			min = m
 475		}
 476	}
 477	return min
 478}
 479
 480func (t *notMatchTree) nextDoc() uint32 {
 481	return 0
 482}
 483
 484func (t *fileNameMatchTree) nextDoc() uint32 {
 485	return t.child.nextDoc()
 486}
 487
 488func (t *boostMatchTree) nextDoc() uint32 {
 489	return t.child.nextDoc()
 490}
 491
 492func (t *branchQueryMatchTree) nextDoc() uint32 {
 493	var start uint32
 494	if t.firstDone {
 495		start = t.docID + 1
 496	}
 497
 498	for i := start; i < uint32(len(t.fileMasks)); i++ {
 499		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 500			return i
 501		}
 502	}
 503	return math.MaxUint32
 504}
 505
 506// all String methods
 507
 508func (t *bruteForceMatchTree) String() string {
 509	return "all"
 510}
 511
 512func (t *docMatchTree) String() string {
 513	return fmt.Sprintf("doc(%s)", t.reason)
 514}
 515
 516func (t *andMatchTree) String() string {
 517	return fmt.Sprintf("and%v", t.children)
 518}
 519
 520func (t *regexpMatchTree) String() string {
 521	f := ""
 522	if t.fileName {
 523		f = "f"
 524	}
 525	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 526}
 527
 528func (t *wordMatchTree) String() string {
 529	f := ""
 530	if t.fileName {
 531		f = "f"
 532	}
 533	return fmt.Sprintf("%sword(%s)", f, t.word)
 534}
 535
 536func (t *orMatchTree) String() string {
 537	return fmt.Sprintf("or%v", t.children)
 538}
 539
 540func (t *notMatchTree) String() string {
 541	return fmt.Sprintf("not(%v)", t.child)
 542}
 543
 544func (t *noVisitMatchTree) String() string {
 545	return fmt.Sprintf("novisit(%v)", t.matchTree)
 546}
 547
 548func (t *fileNameMatchTree) String() string {
 549	return fmt.Sprintf("f(%v)", t.child)
 550}
 551
 552func (t *boostMatchTree) String() string {
 553	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 554}
 555
 556func (t *substrMatchTree) String() string {
 557	f := ""
 558	if t.fileName {
 559		f = "f"
 560	}
 561
 562	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 563}
 564
 565func (t *branchQueryMatchTree) String() string {
 566	return fmt.Sprintf("branch(%x)", t.masks)
 567}
 568
 569func (t *symbolSubstrMatchTree) String() string {
 570	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 571}
 572
 573func (t *symbolRegexpMatchTree) String() string {
 574	return fmt.Sprintf("symbol(%v)", t.matchTree)
 575}
 576
 577// visitMatches visits all atoms in matchTree. Note: This visits
 578// noVisitMatchTree. For collecting matches use visitMatches.
 579func visitMatchTree(t matchTree, f func(matchTree)) {
 580	switch s := t.(type) {
 581	case *andMatchTree:
 582		for _, ch := range s.children {
 583			visitMatchTree(ch, f)
 584		}
 585	case *orMatchTree:
 586		for _, ch := range s.children {
 587			visitMatchTree(ch, f)
 588		}
 589	case *andLineMatchTree:
 590		visitMatchTree(&s.andMatchTree, f)
 591	case *noVisitMatchTree:
 592		visitMatchTree(s.matchTree, f)
 593	case *notMatchTree:
 594		visitMatchTree(s.child, f)
 595	case *fileNameMatchTree:
 596		visitMatchTree(s.child, f)
 597	case *boostMatchTree:
 598		visitMatchTree(s.child, f)
 599	case *symbolSubstrMatchTree:
 600		visitMatchTree(s.substrMatchTree, f)
 601	case *symbolRegexpMatchTree:
 602		visitMatchTree(s.matchTree, f)
 603	default:
 604		f(t)
 605	}
 606}
 607
 608// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 609// function defined.
 610func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
 611	visitMatchTree(mt, func(mt matchTree) {
 612		if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
 613			atom.updateStats(stats)
 614		}
 615	})
 616}
 617
 618func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 619	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 620		f(mt)
 621	})
 622}
 623
 624// visitMatches visits all atoms which can contribute matches. Note: This
 625// skips noVisitMatchTree.
 626func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 627	switch s := t.(type) {
 628	case *andMatchTree:
 629		for _, ch := range s.children {
 630			if known[ch] {
 631				visitMatches(ch, known, weight, f)
 632			}
 633		}
 634	case *andLineMatchTree:
 635		visitMatches(&s.andMatchTree, known, weight, f)
 636	case *orMatchTree:
 637		for _, ch := range s.children {
 638			if known[ch] {
 639				visitMatches(ch, known, weight, f)
 640			}
 641		}
 642	case *boostMatchTree:
 643		visitMatches(s.child, known, weight*s.boost, f)
 644	case *symbolSubstrMatchTree:
 645		visitMatches(s.substrMatchTree, known, weight, f)
 646	case *notMatchTree:
 647	case *noVisitMatchTree:
 648		// don't collect into negative trees.
 649	case *fileNameMatchTree:
 650		// We will just gather the filename if we do not visit this tree.
 651	default:
 652		f(s, weight)
 653	}
 654}
 655
 656// all matches() methods.
 657
 658func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 659	return matchesStatePred(t.predicate(cp.idx))
 660}
 661
 662func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 663	return matchesFound
 664}
 665
 666// andLineMatchTree is a performance optimization of andMatchTree. For content
 667// searches we don't want to run the regex engine if there is no line that
 668// contains matches from all terms.
 669func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 670	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 671		return state
 672	}
 673
 674	// Invariant: all children have matches. If any line contains all of them we
 675	// can return MatchesFound.
 676
 677	// find child with fewest candidates
 678	minCount := math.MaxInt
 679	fewestChildren := 0
 680	for ix, child := range t.children {
 681		v, ok := child.(*substrMatchTree)
 682		// make sure we are running a content search and that all candidates are a
 683		// substrMatchTree
 684		if !ok || v.fileName {
 685			return matchesFound
 686		}
 687		if len(v.current) < minCount {
 688			minCount = len(v.current)
 689			fewestChildren = ix
 690		}
 691	}
 692
 693	type lineRange struct {
 694		start int
 695		end   int
 696	}
 697	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 698	prev := -1
 699	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 700		line := cp.newlines().atOffset(candidate.byteOffset)
 701		if line == prev {
 702			continue
 703		}
 704		prev = line
 705		byteStart := int(cp.newlines().lineStart(line))
 706		byteEnd := int(cp.newlines().lineStart(line + 1))
 707		lines = append(lines, lineRange{byteStart, byteEnd})
 708	}
 709
 710	// children keeps track of the children's candidates we have already seen.
 711	children := make([][]*candidateMatch, 0, len(t.children)-1)
 712	for j, child := range t.children {
 713		if j == fewestChildren {
 714			continue
 715		}
 716		children = append(children, child.(*substrMatchTree).current)
 717	}
 718
 719nextLine:
 720	for i := 0; i < len(lines); i++ {
 721		hits := 1
 722	nextChild:
 723		for j := range children {
 724		nextCandidate:
 725			for len(children[j]) > 0 {
 726				candidate := children[j][0]
 727				bo := int(cp.findOffset(false, candidate.runeOffset))
 728				if bo < lines[i].start {
 729					children[j] = children[j][1:]
 730					continue nextCandidate
 731				}
 732				if bo < lines[i].end {
 733					hits++
 734					continue nextChild
 735				}
 736				// move the `lines` iterator forward until bo < line.end
 737				for i < len(lines) && bo >= lines[i].end {
 738					i++
 739				}
 740				i--
 741				continue nextLine
 742			}
 743		}
 744		// return early once we found any line that contains matches from all children
 745		if hits == len(t.children) {
 746			return matchesFound
 747		}
 748	}
 749	return matchesNone
 750}
 751
 752func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 753	// We have found matches unless a child needs to do more work or it hasn't
 754	// found matches.
 755	state := matchesFound
 756
 757	for _, ch := range t.children {
 758		switch evalMatchTree(cp, cost, known, ch) {
 759		case matchesRequiresHigherCost:
 760			// keep evaluating other children incase we come across matchesNone
 761			state = matchesRequiresHigherCost
 762		case matchesFound:
 763			// will return this if every child has this value
 764		case matchesNone:
 765			return matchesNone
 766		}
 767	}
 768
 769	return state
 770}
 771
 772func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 773	// we could short-circuit, but we want to use the other possibilities as a
 774	// ranking signal. So we always return the most conservative state.
 775	state := matchesNone
 776	for _, ch := range t.children {
 777		switch evalMatchTree(cp, cost, known, ch) {
 778		case matchesRequiresHigherCost:
 779			state = matchesRequiresHigherCost
 780		case matchesFound:
 781			if state != matchesRequiresHigherCost {
 782				state = matchesFound
 783			}
 784		case matchesNone:
 785			// noop
 786		}
 787	}
 788	return state
 789}
 790
 791func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 792	return matchesStatePred(t.branchMask() != 0)
 793}
 794
 795func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 796	if t.reEvaluated {
 797		return matchesStateForSlice(t.found)
 798	}
 799
 800	if cost < costRegexp {
 801		return matchesRequiresHigherCost
 802	}
 803
 804	cp.stats.RegexpsConsidered++
 805	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 806	found := t.found[:0]
 807	for _, idx := range idxs {
 808		cm := &candidateMatch{
 809			byteOffset:  uint32(idx[0]),
 810			byteMatchSz: uint32(idx[1] - idx[0]),
 811			fileName:    t.fileName,
 812		}
 813
 814		found = append(found, cm)
 815	}
 816	t.found = found
 817	t.reEvaluated = true
 818
 819	return matchesStateForSlice(t.found)
 820}
 821
 822func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 823	if t.evaluated {
 824		return matchesStateForSlice(t.found)
 825	}
 826
 827	if cost < costRegexp {
 828		return matchesRequiresHigherCost
 829	}
 830
 831	data := cp.data(t.fileName)
 832	offset := 0
 833	found := t.found[:0]
 834	for {
 835		idx := bytes.Index(data[offset:], []byte(t.word))
 836		if idx < 0 {
 837			break
 838		}
 839
 840		relStartOffset := offset + idx
 841		relEndOffset := relStartOffset + len(t.word)
 842
 843		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 844		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 845		if startBoundary && endBoundary {
 846			found = append(found, &candidateMatch{
 847				byteOffset:  uint32(offset + idx),
 848				byteMatchSz: uint32(len(t.word)),
 849				fileName:    t.fileName,
 850			})
 851		}
 852		offset += idx + len(t.word)
 853	}
 854
 855	t.found = found
 856	t.evaluated = true
 857
 858	return matchesStateForSlice(t.found)
 859}
 860
 861// breakMatchesOnNewlines returns matches resulting from breaking each element
 862// of cms on newlines within text.
 863func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 864	var lineCMs []*candidateMatch
 865	for _, cm := range cms {
 866		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 867	}
 868	return lineCMs
 869}
 870
 871// breakOnNewlines returns matches resulting from breaking cm on newlines
 872// within text.
 873func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 874	var cms []*candidateMatch
 875	addMe := &candidateMatch{}
 876	*addMe = *cm
 877	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 878		if text[i] == '\n' {
 879			addMe.byteMatchSz = i - addMe.byteOffset
 880			if addMe.byteMatchSz != 0 {
 881				cms = append(cms, addMe)
 882			}
 883
 884			addMe = &candidateMatch{}
 885			*addMe = *cm
 886			addMe.byteOffset = i + 1
 887		}
 888	}
 889	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 890	if addMe.byteMatchSz != 0 {
 891		cms = append(cms, addMe)
 892	}
 893	return cms
 894}
 895
 896// evalMatchTree should be called instead of directly calling
 897// matchTree.matches. It cache known values for future evaluation at higher
 898// costs.
 899func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 900	if v, ok := known[mt]; ok {
 901		return matchesStatePred(v)
 902	}
 903
 904	ms := mt.matches(cp, cost, known)
 905	if ms != matchesRequiresHigherCost {
 906		known[mt] = ms == matchesFound
 907	}
 908
 909	return ms
 910}
 911
 912func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 913	switch evalMatchTree(cp, cost, known, t.child) {
 914	case matchesRequiresHigherCost:
 915		return matchesRequiresHigherCost
 916	case matchesFound:
 917		return matchesNone
 918	case matchesNone:
 919		return matchesFound
 920	default:
 921		panic("unreachable")
 922	}
 923}
 924
 925func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 926	return evalMatchTree(cp, cost, known, t.child)
 927}
 928
 929func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 930	return evalMatchTree(cp, cost, known, t.child)
 931}
 932
 933func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 934	if t.contEvaluated {
 935		return matchesStateForSlice(t.current)
 936	}
 937
 938	if len(t.current) == 0 {
 939		return matchesNone
 940	}
 941
 942	if t.fileName && cost < costMemory {
 943		return matchesRequiresHigherCost
 944	}
 945
 946	if !t.fileName && cost < costContent {
 947		return matchesRequiresHigherCost
 948	}
 949
 950	pruned := t.current[:0]
 951	for _, m := range t.current {
 952		if m.byteOffset == 0 && m.runeOffset > 0 {
 953			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 954		}
 955		if m.matchContent(cp.data(m.fileName)) {
 956			pruned = append(pruned, m)
 957		}
 958	}
 959	t.current = pruned
 960	t.contEvaluated = true
 961
 962	return matchesStateForSlice(t.current)
 963}
 964
 965type matchTreeOpt struct {
 966	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 967	// This was added since we do not support wordMatchTree with symbol search.
 968	DisableWordMatchOptimization bool
 969}
 970
 971func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 972	if q == nil {
 973		return nil, fmt.Errorf("got nil (sub)query")
 974	}
 975
 976	switch s := q.(type) {
 977	case *query.Regexp:
 978		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 979		// superset of the regexp. If the returned matchTree is equivalent to the
 980		// original regexp, it returns true. An equivalent matchTree has the same
 981		// behaviour as the original regexp and can be used instead.
 982		//
 983		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 984		if err != nil {
 985			return nil, err
 986		}
 987		// if the query can be used in place of the regexp
 988		// return the subtree
 989		if isEq {
 990			return subMT, nil
 991		}
 992
 993		var tr matchTree
 994		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 995			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 996			// provide something faster.
 997			tr = wmt
 998		} else {
 999			tr = newRegexpMatchTree(s)
1000		}
1001
1002		return &andMatchTree{
1003			children: []matchTree{
1004				tr, &noVisitMatchTree{subMT},
1005			},
1006		}, nil
1007	case *query.And:
1008		var r []matchTree
1009		for _, ch := range s.Children {
1010			ct, err := d.newMatchTree(ch, opt)
1011			if err != nil {
1012				return nil, err
1013			}
1014			r = append(r, ct)
1015		}
1016		return &andMatchTree{r}, nil
1017	case *query.Or:
1018		var r []matchTree
1019		for _, ch := range s.Children {
1020			ct, err := d.newMatchTree(ch, opt)
1021			if err != nil {
1022				return nil, err
1023			}
1024			r = append(r, ct)
1025		}
1026		return &orMatchTree{r}, nil
1027	case *query.Not:
1028		ct, err := d.newMatchTree(s.Child, opt)
1029		return &notMatchTree{
1030			child: ct,
1031		}, err
1032
1033	case *query.Type:
1034		if s.Type != query.TypeFileName {
1035			break
1036		}
1037
1038		ct, err := d.newMatchTree(s.Child, opt)
1039		if err != nil {
1040			return nil, err
1041		}
1042
1043		return &fileNameMatchTree{
1044			child: ct,
1045		}, nil
1046
1047	case *query.Boost:
1048		ct, err := d.newMatchTree(s.Child, opt)
1049		if err != nil {
1050			return nil, err
1051		}
1052
1053		return &boostMatchTree{
1054			child: ct,
1055			boost: s.Boost,
1056		}, nil
1057
1058	case *query.Meta:
1059		checksum := queryMetaChecksum(s.Field, s.Value)
1060		cacheKeyField := "Meta"
1061		if cached, ok := d.docMatchTreeCache.Get(cacheKeyField, checksum); ok {
1062			return cached, nil
1063		}
1064
1065		reposWant := make([]bool, len(d.repoMetaData))
1066		for repoIdx, r := range d.repoMetaData {
1067			if r.Metadata != nil {
1068				if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) {
1069					reposWant[repoIdx] = true
1070				}
1071			}
1072		}
1073
1074		mt := &docMatchTree{
1075			reason:  "Meta",
1076			numDocs: d.numDocs(),
1077			predicate: func(docID uint32) bool {
1078				repoIdx := d.repos[docID]
1079				if int(repoIdx) >= len(reposWant) {
1080					return false
1081				}
1082				return reposWant[repoIdx]
1083			},
1084		}
1085		d.docMatchTreeCache.Add(cacheKeyField, checksum, mt)
1086		return mt, nil
1087
1088	case *query.Substring:
1089		return d.newSubstringMatchTree(s)
1090
1091	case *query.Branch:
1092		masks := make([]uint64, 0, len(d.repoMetaData))
1093		if s.Pattern == "HEAD" {
1094			for range d.repoMetaData {
1095				masks = append(masks, 1)
1096			}
1097		} else {
1098			for _, branchIDs := range d.branchIDs {
1099				mask := uint64(0)
1100				for nm, m := range branchIDs {
1101					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1102						mask |= uint64(m)
1103					}
1104				}
1105				masks = append(masks, mask)
1106			}
1107		}
1108		return &branchQueryMatchTree{
1109			masks:     masks,
1110			fileMasks: d.fileBranchMasks,
1111			repos:     d.repos,
1112		}, nil
1113	case *query.Const:
1114		if s.Value {
1115			return &bruteForceMatchTree{}, nil
1116		} else {
1117			return &noMatchTree{Why: "const"}, nil
1118		}
1119	case *query.Language:
1120		code, ok := d.metaData.LanguageMap[s.Language]
1121		if !ok {
1122			return &noMatchTree{Why: "lang"}, nil
1123		}
1124		return &docMatchTree{
1125			reason:  "language",
1126			numDocs: d.numDocs(),
1127			predicate: func(docID uint32) bool {
1128				return d.getLanguage(docID) == code
1129			},
1130		}, nil
1131
1132	case *query.Symbol:
1133		// Disable WordMatchTree since we don't support it in symbols yet.
1134		optCopy := opt
1135		optCopy.DisableWordMatchOptimization = true
1136
1137		subMT, err := d.newMatchTree(s.Expr, optCopy)
1138		if err != nil {
1139			return nil, err
1140		}
1141
1142		if substr, ok := subMT.(*substrMatchTree); ok {
1143			return &symbolSubstrMatchTree{
1144				substrMatchTree: substr,
1145				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1146				fileEndRunes:    d.fileEndRunes,
1147				fileEndSymbol:   d.fileEndSymbol,
1148				sections:        d.runeDocSections,
1149			}, nil
1150		}
1151
1152		var regexpMT *regexpMatchTree
1153		visitMatchTree(subMT, func(mt matchTree) {
1154			if t, ok := mt.(*regexpMatchTree); ok {
1155				regexpMT = t
1156			}
1157		})
1158		if regexpMT == nil {
1159			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1160		}
1161
1162		return &symbolRegexpMatchTree{
1163			regexp:    regexpMT.regexp,
1164			all:       isRegexpAll(regexpMT.origRegexp),
1165			matchTree: subMT,
1166		}, nil
1167
1168	case *query.FileNameSet:
1169		return &docMatchTree{
1170			reason:  "FileNameSet",
1171			numDocs: d.numDocs(),
1172			predicate: func(docID uint32) bool {
1173				fileName := d.fileName(docID)
1174				_, ok := s.Set[string(fileName)]
1175				return ok
1176			},
1177		}, nil
1178
1179	case *query.BranchesRepos:
1180		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1181		for repoIdx := range d.repoMetaData {
1182			var mask uint64
1183			for _, br := range s.List {
1184				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1185					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1186				}
1187			}
1188			reposBranchesWant[repoIdx] = mask
1189		}
1190		return &docMatchTree{
1191			reason:  "BranchesRepos",
1192			numDocs: d.numDocs(),
1193			predicate: func(docID uint32) bool {
1194				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1195			},
1196		}, nil
1197
1198	case *query.RepoSet:
1199		reposWant := make([]bool, len(d.repoMetaData))
1200		for repoIdx, r := range d.repoMetaData {
1201			if _, ok := s.Set[r.Name]; ok {
1202				reposWant[repoIdx] = true
1203			}
1204		}
1205		return &docMatchTree{
1206			reason:  "RepoSet",
1207			numDocs: d.numDocs(),
1208			predicate: func(docID uint32) bool {
1209				return reposWant[d.repos[docID]]
1210			},
1211		}, nil
1212
1213	case *query.RepoIDs:
1214		reposWant := make([]bool, len(d.repoMetaData))
1215		for repoIdx, r := range d.repoMetaData {
1216			if s.Repos.Contains(r.ID) {
1217				reposWant[repoIdx] = true
1218			}
1219		}
1220		return &docMatchTree{
1221			reason:  "RepoIDs",
1222			numDocs: d.numDocs(),
1223			predicate: func(docID uint32) bool {
1224				return reposWant[d.repos[docID]]
1225			},
1226		}, nil
1227
1228	case *query.Repo:
1229		reposWant := make([]bool, len(d.repoMetaData))
1230		for repoIdx, r := range d.repoMetaData {
1231			if s.Regexp.MatchString(r.Name) {
1232				reposWant[repoIdx] = true
1233			}
1234		}
1235		return &docMatchTree{
1236			reason:  "Repo",
1237			numDocs: d.numDocs(),
1238			predicate: func(docID uint32) bool {
1239				return reposWant[d.repos[docID]]
1240			},
1241		}, nil
1242
1243	case *query.RepoRegexp:
1244		reposWant := make([]bool, len(d.repoMetaData))
1245		for repoIdx, r := range d.repoMetaData {
1246			if s.Regexp.MatchString(r.Name) {
1247				reposWant[repoIdx] = true
1248			}
1249		}
1250		return &docMatchTree{
1251			reason:  "RepoRegexp",
1252			numDocs: d.numDocs(),
1253			predicate: func(docID uint32) bool {
1254				return reposWant[d.repos[docID]]
1255			},
1256		}, nil
1257
1258	case query.RawConfig:
1259		return &docMatchTree{
1260			reason:  s.String(),
1261			numDocs: d.numDocs(),
1262			predicate: func(docID uint32) bool {
1263				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1264			},
1265		}, nil
1266	}
1267	log.Panicf("type %T", q)
1268	return nil, nil
1269}
1270
1271func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1272	st := &substrMatchTree{
1273		query:         s,
1274		caseSensitive: s.CaseSensitive,
1275		fileName:      s.FileName,
1276	}
1277
1278	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1279		return newRegexpMatchTree(&query.Regexp{
1280			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1281			FileName:      s.FileName,
1282			Content:       s.Content,
1283			CaseSensitive: s.CaseSensitive,
1284		}), nil
1285	}
1286
1287	result, err := d.iterateNgrams(s)
1288	if err != nil {
1289		return nil, err
1290	}
1291	st.matchIterator = result
1292	return st, nil
1293}
1294
1295func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1296	if opt.DisableWordMatchOptimization {
1297		return nil, false
1298	}
1299	// Needs to be case sensitive
1300	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1301		return nil, false
1302	}
1303	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1304	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1305		return nil, false
1306	}
1307	sub := q.Regexp.Sub
1308	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1309		return nil, false
1310	}
1311
1312	return &wordMatchTree{
1313		word:     string(sub[1].Rune),
1314		fileName: q.FileName,
1315	}, true
1316}
1317
1318// pruneMatchTree removes impossible branches from the matchTree, as indicated
1319// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1320func pruneMatchTree(mt matchTree) (matchTree, error) {
1321	var err error
1322	switch mt := mt.(type) {
1323	// leaf nodes that we test with the filter:
1324	case *substrMatchTree:
1325		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1326			if _, ok := res.matchIterator.(*noMatchTree); ok {
1327				return nil, nil
1328			}
1329		}
1330	// recursive tree structures:
1331	case *andMatchTree:
1332		// Any branch of an and becoming impossible means the entire clause
1333		// is impossible. Otherwise, just handle rewrites.
1334		for i, child := range mt.children {
1335			newChild, err := pruneMatchTree(child)
1336			if err != nil {
1337				return nil, err
1338			}
1339			if newChild == nil {
1340				return nil, nil
1341			}
1342			mt.children[i] = newChild
1343		}
1344	case *orMatchTree:
1345		// *All* branches of an OR becoming impossible means the entire clause
1346		// is impossible. Otherwise, drop impossible subclauses and handle
1347		// rewrites, including simplifying to a singular resulting child branch.
1348		n := 0
1349		for _, child := range mt.children {
1350			newChild, err := pruneMatchTree(child)
1351			if err != nil {
1352				return nil, err
1353			}
1354			if newChild != nil {
1355				mt.children[n] = newChild
1356				n++
1357			}
1358		}
1359		mt.children = mt.children[:n]
1360		if len(mt.children) == 1 {
1361			return mt.children[0], nil
1362		} else if len(mt.children) == 0 {
1363			return nil, nil
1364		}
1365	case *noVisitMatchTree:
1366		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1367		if err != nil {
1368			return nil, err
1369		}
1370		if mt.matchTree == nil {
1371			return nil, nil
1372		}
1373	case *fileNameMatchTree:
1374		mt.child, err = pruneMatchTree(mt.child)
1375		if err != nil {
1376			return nil, err
1377		}
1378		if mt.child == nil {
1379			return nil, nil
1380		}
1381	case *boostMatchTree:
1382		mt.child, err = pruneMatchTree(mt.child)
1383		if err != nil {
1384			return nil, err
1385		}
1386		if mt.child == nil {
1387			return nil, nil
1388		}
1389	case *andLineMatchTree:
1390		child, err := pruneMatchTree(&mt.andMatchTree)
1391		if err != nil {
1392			return nil, err
1393		}
1394		if child == nil {
1395			return nil, nil
1396		}
1397		if c, ok := child.(*andMatchTree); ok {
1398			mt.andMatchTree = *c
1399		} else {
1400			// the and was simplified to a single clause,
1401			// so the linematch portion is irrelevant.
1402			return mt, nil
1403		}
1404	case *notMatchTree:
1405		mt.child, err = pruneMatchTree(mt.child)
1406		if err != nil {
1407			return nil, err
1408		}
1409		if mt.child == nil {
1410			// not false => true
1411			return &bruteForceMatchTree{}, nil
1412		}
1413	// unhandled:
1414	case *docMatchTree:
1415	case *bruteForceMatchTree:
1416	case *regexpMatchTree:
1417	case *wordMatchTree:
1418	}
1419	return mt, err
1420}
1421
1422// isRegexpAll returns true if the query matches all possible lines.
1423//
1424// Note: it is possible for a funky regex to actually match all but this
1425// returns false. This returns true for normal looking regexes like ".*" or
1426// "(?-s:.*)".
1427func isRegexpAll(r *syntax.Regexp) bool {
1428	// Note: we don't care about any flags since we are looking for .* and we
1429	// don't mind if . matches all or everything but newline.
1430
1431	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1432	for {
1433		// Our main target: .*
1434		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1435			// (?s:.) or (?-s:.)
1436			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1437		}
1438
1439		// Strip away expressions being wrapped in paranthesis
1440		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1441			r = r.Sub[0]
1442			continue
1443		}
1444
1445		return false
1446	}
1447}
1448
1449func queryMetaChecksum(field string, value *regexp.Regexp) string {
1450	h := xxhash.New()
1451	h.Write([]byte(field))
1452	h.Write([]byte{':'})
1453	h.Write([]byte(value.String()))
1454	return fmt.Sprintf("%x", h.Sum64())
1455}
Configure Feed

Configure Feed