index/matchtree.go at b97d734760cec3ddde795683731e48a9e674ebb7 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchtree.go
at b97d734760cec3ddde795683731e48a9e674ebb7 34 kB View raw
Julie Tibshirani Move root-level index code to index package (#902) 1y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package index
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26	"github.com/sourcegraph/zoekt"
  27	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  28	"github.com/sourcegraph/zoekt/query"
  29)
  30
  31// A docIterator iterates over documents in order.
  32type docIterator interface {
  33	// provide the next document where we may find something interesting.
  34	//
  35	// This is like a "peek" and shouldn't mutate state. prepare is what should
  36	// change state.
  37	nextDoc() uint32
  38
  39	// clears any per-document state of the docIterator, and
  40	// prepares for evaluating the given doc. The argument is
  41	// strictly increasing over time.
  42	prepare(nextDoc uint32)
  43}
  44
  45// costs are passed in increasing order to matchTree.matches until they do not
  46// return matchesRequiresHigherCost.
  47const (
  48	costConst   = 0
  49	costMemory  = 1
  50	costContent = 2
  51	costRegexp  = 3
  52)
  53
  54const (
  55	costMin = costConst
  56	costMax = costRegexp
  57)
  58
  59// matchesState is an enum for the state of a matchTree after a call to
  60// matchTree.matches.
  61type matchesState uint8
  62
  63const (
  64	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  65	// a search yet since the cost value is not high enough.
  66	matchesRequiresHigherCost matchesState = iota
  67
  68	// matchesFound is returned when matchTree.matches has done a search and
  69	// found one or more matches.
  70	matchesFound
  71
  72	// matchesNone is returned when matchTree.matches has done a search and
  73	// found nothing.
  74	matchesNone
  75)
  76
  77// matchesStatePred is a helper which returns matchesFound if b is true
  78// otherwise returns matchesNone.
  79func matchesStatePred(b bool) matchesState {
  80	if b {
  81		return matchesFound
  82	}
  83	return matchesNone
  84}
  85
  86// matchesStateForSlice is a helper which returns matchesFound if v is
  87// non-empty otherwise returns matchesNone.
  88func matchesStateForSlice[T any](v []T) matchesState {
  89	return matchesStatePred(len(v) > 0)
  90}
  91
  92// An expression tree coupled with matches. The matchtree has two
  93// functions:
  94//
  95// * it implements boolean combinations (and, or, not)
  96//
  97// * it implements shortcuts, where we skip documents (for example: if
  98// there are no trigram matches, we can be sure there are no substring
  99// matches). The matchtree iterates over the documents as they are
 100// ordered in the shard.
 101//
 102// The general process for a given (shard, query) is:
 103//
 104// - construct matchTree for the query
 105//
 106// - find all different leaf matchTrees (substring, regexp, etc.)
 107//
 108// in a loop:
 109//
 110//   - find next doc to process using nextDoc
 111//
 112//   - evaluate atoms (leaf expressions that match text)
 113//
 114//   - evaluate the tree using matches(), storing the result in map.
 115//
 116//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 117//     for the document, collect all text matches by looking at leaf
 118//     matchTrees.
 119type matchTree interface {
 120	docIterator
 121
 122	// matches if cost is high enough. See documentation for matchesState's
 123	// values.
 124	//
 125	// Note: Do not call this directly, rather use evalMatchTree which uses
 126	// known to cache responses once the state transitions away from
 127	// matchesRequiresHigherCost.
 128	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 129}
 130
 131// docMatchTree iterates over documents for which predicate(docID) returns true.
 132type docMatchTree struct {
 133	// the number of documents in a shard.
 134	numDocs uint32
 135
 136	predicate func(docID uint32) bool
 137
 138	// provides additional information about the reason why the docMatchTree was
 139	// created.
 140	reason string
 141
 142	// mutable
 143	firstDone bool
 144	docID     uint32
 145}
 146
 147type bruteForceMatchTree struct {
 148	// mutable
 149	firstDone bool
 150	docID     uint32
 151}
 152
 153type andLineMatchTree struct {
 154	andMatchTree
 155}
 156
 157type andMatchTree struct {
 158	children []matchTree
 159}
 160
 161type orMatchTree struct {
 162	children []matchTree
 163}
 164
 165type notMatchTree struct {
 166	child matchTree
 167}
 168
 169// Returns only the filename of child matches.
 170type fileNameMatchTree struct {
 171	child matchTree
 172}
 173
 174type boostMatchTree struct {
 175	child matchTree
 176	boost float64
 177}
 178
 179// Don't visit this subtree for collecting matches.
 180type noVisitMatchTree struct {
 181	matchTree
 182}
 183
 184type regexpMatchTree struct {
 185	regexp *regexp.Regexp
 186
 187	// origRegexp is the original parsed regexp from the query structure. It
 188	// does not include mutations such as case sensitivity.
 189	origRegexp *syntax.Regexp
 190
 191	fileName bool
 192
 193	// mutable
 194	reEvaluated bool
 195	found       []*candidateMatch
 196
 197	// nextDoc, prepare.
 198	bruteForceMatchTree
 199}
 200
 201func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 202	prefix := ""
 203	if !s.CaseSensitive {
 204		prefix = "(?i)"
 205	}
 206
 207	return &regexpMatchTree{
 208		regexp:     regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
 209		origRegexp: s.Regexp,
 210		fileName:   s.FileName,
 211	}
 212}
 213
 214// \bLITERAL\b
 215type wordMatchTree struct {
 216	word string
 217
 218	fileName bool
 219
 220	// mutable
 221	evaluated bool
 222	found     []*candidateMatch
 223
 224	// nextDoc, prepare.
 225	bruteForceMatchTree
 226}
 227
 228type substrMatchTree struct {
 229	matchIterator
 230
 231	query         *query.Substring
 232	caseSensitive bool
 233	fileName      bool
 234
 235	// mutable
 236	current       []*candidateMatch
 237	contEvaluated bool
 238}
 239
 240type branchQueryMatchTree struct {
 241	fileMasks []uint64
 242	masks     []uint64
 243	repos     []uint16
 244
 245	// mutable
 246	firstDone bool
 247	docID     uint32
 248}
 249
 250func (t *branchQueryMatchTree) branchMask() uint64 {
 251	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 252}
 253
 254type symbolRegexpMatchTree struct {
 255	matchTree
 256	regexp *regexp.Regexp
 257	all    bool // skips regex match if .*
 258
 259	reEvaluated bool
 260	found       []*candidateMatch
 261}
 262
 263func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 264	t.reEvaluated = false
 265	t.found = t.found[:0]
 266	t.matchTree.prepare(doc)
 267}
 268
 269func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 270	if t.reEvaluated {
 271		return matchesStateForSlice(t.found)
 272	}
 273
 274	if cost < costRegexp {
 275		return matchesRequiresHigherCost
 276	}
 277
 278	sections := cp.docSections()
 279	content := cp.data(false)
 280
 281	found := t.found[:0]
 282	for i, sec := range sections {
 283		var idx []int
 284		if t.all {
 285			idx = []int{0, int(sec.End - sec.Start)}
 286		} else {
 287			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 288			if idx == nil {
 289				continue
 290			}
 291		}
 292
 293		cm := &candidateMatch{
 294			byteOffset:  sec.Start + uint32(idx[0]),
 295			byteMatchSz: uint32(idx[1] - idx[0]),
 296			symbol:      true,
 297			symbolIdx:   uint32(i),
 298		}
 299		found = append(found, cm)
 300	}
 301	t.found = found
 302	t.reEvaluated = true
 303
 304	return matchesStateForSlice(t.found)
 305}
 306
 307type symbolSubstrMatchTree struct {
 308	*substrMatchTree
 309
 310	patternSize   uint32
 311	fileEndRunes  []uint32
 312	fileEndSymbol []uint32
 313
 314	doc      uint32
 315	sections []DocumentSection
 316
 317	secID uint32
 318}
 319
 320func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 321	t.substrMatchTree.prepare(doc)
 322	t.doc = doc
 323
 324	var fileStart uint32
 325	if doc > 0 {
 326		fileStart = t.fileEndRunes[doc-1]
 327	}
 328
 329	var sections []DocumentSection
 330	if len(t.sections) > 0 {
 331		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 332		if most == uint32(len(t.sections)) {
 333			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 334		} else {
 335			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 336				t.secID++
 337			}
 338
 339			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 340			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 341				symbolEnd++
 342			}
 343
 344			sections = t.sections[t.secID:symbolEnd]
 345		}
 346	}
 347
 348	secIdx := 0
 349	trimmed := t.current[:0]
 350	for len(sections) > secIdx && len(t.current) > 0 {
 351		start := fileStart + t.current[0].runeOffset
 352		end := start + t.patternSize
 353		if start >= sections[secIdx].End {
 354			secIdx++
 355			continue
 356		}
 357
 358		if start < sections[secIdx].Start {
 359			t.current = t.current[1:]
 360			continue
 361		}
 362
 363		if end <= sections[secIdx].End {
 364			t.current[0].symbol = true
 365			t.current[0].symbolIdx = uint32(secIdx)
 366			trimmed = append(trimmed, t.current[0])
 367		}
 368
 369		t.current = t.current[1:]
 370	}
 371	t.current = trimmed
 372}
 373
 374// all prepare methods
 375
 376func (t *bruteForceMatchTree) prepare(doc uint32) {
 377	t.docID = doc
 378	t.firstDone = true
 379}
 380
 381func (t *docMatchTree) prepare(doc uint32) {
 382	t.docID = doc
 383	t.firstDone = true
 384}
 385
 386func (t *andMatchTree) prepare(doc uint32) {
 387	for _, c := range t.children {
 388		c.prepare(doc)
 389	}
 390}
 391
 392func (t *regexpMatchTree) prepare(doc uint32) {
 393	t.found = t.found[:0]
 394	t.reEvaluated = false
 395	t.bruteForceMatchTree.prepare(doc)
 396}
 397
 398func (t *wordMatchTree) prepare(doc uint32) {
 399	t.found = t.found[:0]
 400	t.evaluated = false
 401	t.bruteForceMatchTree.prepare(doc)
 402}
 403
 404func (t *orMatchTree) prepare(doc uint32) {
 405	for _, c := range t.children {
 406		c.prepare(doc)
 407	}
 408}
 409
 410func (t *notMatchTree) prepare(doc uint32) {
 411	t.child.prepare(doc)
 412}
 413
 414func (t *fileNameMatchTree) prepare(doc uint32) {
 415	t.child.prepare(doc)
 416}
 417
 418func (t *boostMatchTree) prepare(doc uint32) {
 419	t.child.prepare(doc)
 420}
 421
 422func (t *substrMatchTree) prepare(nextDoc uint32) {
 423	t.matchIterator.prepare(nextDoc)
 424	t.current = t.matchIterator.candidates()
 425	t.contEvaluated = false
 426}
 427
 428func (t *branchQueryMatchTree) prepare(doc uint32) {
 429	t.firstDone = true
 430	t.docID = doc
 431}
 432
 433// nextDoc
 434
 435func (t *docMatchTree) nextDoc() uint32 {
 436	var start uint32
 437	if t.firstDone {
 438		start = t.docID + 1
 439	}
 440	for i := start; i < t.numDocs; i++ {
 441		if t.predicate(i) {
 442			return i
 443		}
 444	}
 445	return maxUInt32
 446}
 447
 448func (t *bruteForceMatchTree) nextDoc() uint32 {
 449	if !t.firstDone {
 450		return 0
 451	}
 452	return t.docID + 1
 453}
 454
 455func (t *andMatchTree) nextDoc() uint32 {
 456	var max uint32
 457	for _, c := range t.children {
 458		m := c.nextDoc()
 459		if m > max {
 460			max = m
 461		}
 462	}
 463	return max
 464}
 465
 466func (t *orMatchTree) nextDoc() uint32 {
 467	min := uint32(maxUInt32)
 468	for _, c := range t.children {
 469		m := c.nextDoc()
 470		if m < min {
 471			min = m
 472		}
 473	}
 474	return min
 475}
 476
 477func (t *notMatchTree) nextDoc() uint32 {
 478	return 0
 479}
 480
 481func (t *fileNameMatchTree) nextDoc() uint32 {
 482	return t.child.nextDoc()
 483}
 484
 485func (t *boostMatchTree) nextDoc() uint32 {
 486	return t.child.nextDoc()
 487}
 488
 489func (t *branchQueryMatchTree) nextDoc() uint32 {
 490	var start uint32
 491	if t.firstDone {
 492		start = t.docID + 1
 493	}
 494
 495	for i := start; i < uint32(len(t.fileMasks)); i++ {
 496		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 497			return i
 498		}
 499	}
 500	return maxUInt32
 501}
 502
 503// all String methods
 504
 505func (t *bruteForceMatchTree) String() string {
 506	return "all"
 507}
 508
 509func (t *docMatchTree) String() string {
 510	return fmt.Sprintf("doc(%s)", t.reason)
 511}
 512
 513func (t *andMatchTree) String() string {
 514	return fmt.Sprintf("and%v", t.children)
 515}
 516
 517func (t *regexpMatchTree) String() string {
 518	f := ""
 519	if t.fileName {
 520		f = "f"
 521	}
 522	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 523}
 524
 525func (t *wordMatchTree) String() string {
 526	f := ""
 527	if t.fileName {
 528		f = "f"
 529	}
 530	return fmt.Sprintf("%sword(%s)", f, t.word)
 531}
 532
 533func (t *orMatchTree) String() string {
 534	return fmt.Sprintf("or%v", t.children)
 535}
 536
 537func (t *notMatchTree) String() string {
 538	return fmt.Sprintf("not(%v)", t.child)
 539}
 540
 541func (t *noVisitMatchTree) String() string {
 542	return fmt.Sprintf("novisit(%v)", t.matchTree)
 543}
 544
 545func (t *fileNameMatchTree) String() string {
 546	return fmt.Sprintf("f(%v)", t.child)
 547}
 548
 549func (t *boostMatchTree) String() string {
 550	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 551}
 552
 553func (t *substrMatchTree) String() string {
 554	f := ""
 555	if t.fileName {
 556		f = "f"
 557	}
 558
 559	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 560}
 561
 562func (t *branchQueryMatchTree) String() string {
 563	return fmt.Sprintf("branch(%x)", t.masks)
 564}
 565
 566func (t *symbolSubstrMatchTree) String() string {
 567	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 568}
 569
 570func (t *symbolRegexpMatchTree) String() string {
 571	return fmt.Sprintf("symbol(%v)", t.matchTree)
 572}
 573
 574// visitMatches visits all atoms in matchTree. Note: This visits
 575// noVisitMatchTree. For collecting matches use visitMatches.
 576func visitMatchTree(t matchTree, f func(matchTree)) {
 577	switch s := t.(type) {
 578	case *andMatchTree:
 579		for _, ch := range s.children {
 580			visitMatchTree(ch, f)
 581		}
 582	case *orMatchTree:
 583		for _, ch := range s.children {
 584			visitMatchTree(ch, f)
 585		}
 586	case *andLineMatchTree:
 587		visitMatchTree(&s.andMatchTree, f)
 588	case *noVisitMatchTree:
 589		visitMatchTree(s.matchTree, f)
 590	case *notMatchTree:
 591		visitMatchTree(s.child, f)
 592	case *fileNameMatchTree:
 593		visitMatchTree(s.child, f)
 594	case *boostMatchTree:
 595		visitMatchTree(s.child, f)
 596	case *symbolSubstrMatchTree:
 597		visitMatchTree(s.substrMatchTree, f)
 598	case *symbolRegexpMatchTree:
 599		visitMatchTree(s.matchTree, f)
 600	default:
 601		f(t)
 602	}
 603}
 604
 605// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 606// function defined.
 607func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
 608	visitMatchTree(mt, func(mt matchTree) {
 609		if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
 610			atom.updateStats(stats)
 611		}
 612	})
 613}
 614
 615func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 616	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 617		f(mt)
 618	})
 619}
 620
 621// visitMatches visits all atoms which can contribute matches. Note: This
 622// skips noVisitMatchTree.
 623func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 624	switch s := t.(type) {
 625	case *andMatchTree:
 626		for _, ch := range s.children {
 627			if known[ch] {
 628				visitMatches(ch, known, weight, f)
 629			}
 630		}
 631	case *andLineMatchTree:
 632		visitMatches(&s.andMatchTree, known, weight, f)
 633	case *orMatchTree:
 634		for _, ch := range s.children {
 635			if known[ch] {
 636				visitMatches(ch, known, weight, f)
 637			}
 638		}
 639	case *boostMatchTree:
 640		visitMatches(s.child, known, weight*s.boost, f)
 641	case *symbolSubstrMatchTree:
 642		visitMatches(s.substrMatchTree, known, weight, f)
 643	case *notMatchTree:
 644	case *noVisitMatchTree:
 645		// don't collect into negative trees.
 646	case *fileNameMatchTree:
 647		// We will just gather the filename if we do not visit this tree.
 648	default:
 649		f(s, weight)
 650	}
 651}
 652
 653// all matches() methods.
 654
 655func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 656	return matchesStatePred(t.predicate(cp.idx))
 657}
 658
 659func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 660	return matchesFound
 661}
 662
 663// andLineMatchTree is a performance optimization of andMatchTree. For content
 664// searches we don't want to run the regex engine if there is no line that
 665// contains matches from all terms.
 666func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 667	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 668		return state
 669	}
 670
 671	// Invariant: all children have matches. If any line contains all of them we
 672	// can return MatchesFound.
 673
 674	// find child with fewest candidates
 675	min := maxUInt32
 676	fewestChildren := 0
 677	for ix, child := range t.children {
 678		v, ok := child.(*substrMatchTree)
 679		// make sure we are running a content search and that all candidates are a
 680		// substrMatchTree
 681		if !ok || v.fileName {
 682			return matchesFound
 683		}
 684		if len(v.current) < min {
 685			min = len(v.current)
 686			fewestChildren = ix
 687		}
 688	}
 689
 690	type lineRange struct {
 691		start int
 692		end   int
 693	}
 694	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 695	prev := -1
 696	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 697		line := cp.newlines().atOffset(candidate.byteOffset)
 698		if line == prev {
 699			continue
 700		}
 701		prev = line
 702		byteStart := int(cp.newlines().lineStart(line))
 703		byteEnd := int(cp.newlines().lineStart(line + 1))
 704		lines = append(lines, lineRange{byteStart, byteEnd})
 705	}
 706
 707	// children keeps track of the children's candidates we have already seen.
 708	children := make([][]*candidateMatch, 0, len(t.children)-1)
 709	for j, child := range t.children {
 710		if j == fewestChildren {
 711			continue
 712		}
 713		children = append(children, child.(*substrMatchTree).current)
 714	}
 715
 716nextLine:
 717	for i := 0; i < len(lines); i++ {
 718		hits := 1
 719	nextChild:
 720		for j := range children {
 721		nextCandidate:
 722			for len(children[j]) > 0 {
 723				candidate := children[j][0]
 724				bo := int(cp.findOffset(false, candidate.runeOffset))
 725				if bo < lines[i].start {
 726					children[j] = children[j][1:]
 727					continue nextCandidate
 728				}
 729				if bo < lines[i].end {
 730					hits++
 731					continue nextChild
 732				}
 733				// move the `lines` iterator forward until bo < line.end
 734				for i < len(lines) && bo >= lines[i].end {
 735					i++
 736				}
 737				i--
 738				continue nextLine
 739			}
 740		}
 741		// return early once we found any line that contains matches from all children
 742		if hits == len(t.children) {
 743			return matchesFound
 744		}
 745	}
 746	return matchesNone
 747}
 748
 749func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 750	// We have found matches unless a child needs to do more work or it hasn't
 751	// found matches.
 752	state := matchesFound
 753
 754	for _, ch := range t.children {
 755		switch evalMatchTree(cp, cost, known, ch) {
 756		case matchesRequiresHigherCost:
 757			// keep evaluating other children incase we come across matchesNone
 758			state = matchesRequiresHigherCost
 759		case matchesFound:
 760			// will return this if every child has this value
 761		case matchesNone:
 762			return matchesNone
 763		}
 764	}
 765
 766	return state
 767}
 768
 769func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 770	// we could short-circuit, but we want to use the other possibilities as a
 771	// ranking signal. So we always return the most conservative state.
 772	state := matchesNone
 773	for _, ch := range t.children {
 774		switch evalMatchTree(cp, cost, known, ch) {
 775		case matchesRequiresHigherCost:
 776			state = matchesRequiresHigherCost
 777		case matchesFound:
 778			if state != matchesRequiresHigherCost {
 779				state = matchesFound
 780			}
 781		case matchesNone:
 782			// noop
 783		}
 784	}
 785	return state
 786}
 787
 788func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 789	return matchesStatePred(t.branchMask() != 0)
 790}
 791
 792func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 793	if t.reEvaluated {
 794		return matchesStateForSlice(t.found)
 795	}
 796
 797	if cost < costRegexp {
 798		return matchesRequiresHigherCost
 799	}
 800
 801	cp.stats.RegexpsConsidered++
 802	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 803	found := t.found[:0]
 804	for _, idx := range idxs {
 805		cm := &candidateMatch{
 806			byteOffset:  uint32(idx[0]),
 807			byteMatchSz: uint32(idx[1] - idx[0]),
 808			fileName:    t.fileName,
 809		}
 810
 811		found = append(found, cm)
 812	}
 813	t.found = found
 814	t.reEvaluated = true
 815
 816	return matchesStateForSlice(t.found)
 817}
 818
 819func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 820	if t.evaluated {
 821		return matchesStateForSlice(t.found)
 822	}
 823
 824	if cost < costRegexp {
 825		return matchesRequiresHigherCost
 826	}
 827
 828	data := cp.data(t.fileName)
 829	offset := 0
 830	found := t.found[:0]
 831	for {
 832		idx := bytes.Index(data[offset:], []byte(t.word))
 833		if idx < 0 {
 834			break
 835		}
 836
 837		relStartOffset := offset + idx
 838		relEndOffset := relStartOffset + len(t.word)
 839
 840		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 841		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 842		if startBoundary && endBoundary {
 843			found = append(found, &candidateMatch{
 844				byteOffset:  uint32(offset + idx),
 845				byteMatchSz: uint32(len(t.word)),
 846				fileName:    t.fileName,
 847			})
 848		}
 849		offset += idx + len(t.word)
 850	}
 851
 852	t.found = found
 853	t.evaluated = true
 854
 855	return matchesStateForSlice(t.found)
 856}
 857
 858// breakMatchesOnNewlines returns matches resulting from breaking each element
 859// of cms on newlines within text.
 860func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 861	var lineCMs []*candidateMatch
 862	for _, cm := range cms {
 863		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 864	}
 865	return lineCMs
 866}
 867
 868// breakOnNewlines returns matches resulting from breaking cm on newlines
 869// within text.
 870func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 871	var cms []*candidateMatch
 872	addMe := &candidateMatch{}
 873	*addMe = *cm
 874	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 875		if text[i] == '\n' {
 876			addMe.byteMatchSz = i - addMe.byteOffset
 877			if addMe.byteMatchSz != 0 {
 878				cms = append(cms, addMe)
 879			}
 880
 881			addMe = &candidateMatch{}
 882			*addMe = *cm
 883			addMe.byteOffset = i + 1
 884		}
 885	}
 886	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 887	if addMe.byteMatchSz != 0 {
 888		cms = append(cms, addMe)
 889	}
 890	return cms
 891}
 892
 893// evalMatchTree should be called instead of directly calling
 894// matchTree.matches. It cache known values for future evaluation at higher
 895// costs.
 896func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 897	if v, ok := known[mt]; ok {
 898		return matchesStatePred(v)
 899	}
 900
 901	ms := mt.matches(cp, cost, known)
 902	if ms != matchesRequiresHigherCost {
 903		known[mt] = ms == matchesFound
 904	}
 905
 906	return ms
 907}
 908
 909func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 910	switch evalMatchTree(cp, cost, known, t.child) {
 911	case matchesRequiresHigherCost:
 912		return matchesRequiresHigherCost
 913	case matchesFound:
 914		return matchesNone
 915	case matchesNone:
 916		return matchesFound
 917	default:
 918		panic("unreachable")
 919	}
 920}
 921
 922func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 923	return evalMatchTree(cp, cost, known, t.child)
 924}
 925
 926func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 927	return evalMatchTree(cp, cost, known, t.child)
 928}
 929
 930func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 931	if t.contEvaluated {
 932		return matchesStateForSlice(t.current)
 933	}
 934
 935	if len(t.current) == 0 {
 936		return matchesNone
 937	}
 938
 939	if t.fileName && cost < costMemory {
 940		return matchesRequiresHigherCost
 941	}
 942
 943	if !t.fileName && cost < costContent {
 944		return matchesRequiresHigherCost
 945	}
 946
 947	pruned := t.current[:0]
 948	for _, m := range t.current {
 949		if m.byteOffset == 0 && m.runeOffset > 0 {
 950			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 951		}
 952		if m.matchContent(cp.data(m.fileName)) {
 953			pruned = append(pruned, m)
 954		}
 955	}
 956	t.current = pruned
 957	t.contEvaluated = true
 958
 959	return matchesStateForSlice(t.current)
 960}
 961
 962type matchTreeOpt struct {
 963	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 964	// This was added since we do not support wordMatchTree with symbol search.
 965	DisableWordMatchOptimization bool
 966}
 967
 968func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 969	if q == nil {
 970		return nil, fmt.Errorf("got nil (sub)query")
 971	}
 972	switch s := q.(type) {
 973	case *query.Regexp:
 974		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 975		// superset of the regexp. If the returned matchTree is equivalent to the
 976		// original regexp, it returns true. An equivalent matchTree has the same
 977		// behaviour as the original regexp and can be used instead.
 978		//
 979		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 980		if err != nil {
 981			return nil, err
 982		}
 983		// if the query can be used in place of the regexp
 984		// return the subtree
 985		if isEq {
 986			return subMT, nil
 987		}
 988
 989		var tr matchTree
 990		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 991			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 992			// provide something faster.
 993			tr = wmt
 994		} else {
 995			tr = newRegexpMatchTree(s)
 996		}
 997
 998		return &andMatchTree{
 999			children: []matchTree{
1000				tr, &noVisitMatchTree{subMT},
1001			},
1002		}, nil
1003	case *query.And:
1004		var r []matchTree
1005		for _, ch := range s.Children {
1006			ct, err := d.newMatchTree(ch, opt)
1007			if err != nil {
1008				return nil, err
1009			}
1010			r = append(r, ct)
1011		}
1012		return &andMatchTree{r}, nil
1013	case *query.Or:
1014		var r []matchTree
1015		for _, ch := range s.Children {
1016			ct, err := d.newMatchTree(ch, opt)
1017			if err != nil {
1018				return nil, err
1019			}
1020			r = append(r, ct)
1021		}
1022		return &orMatchTree{r}, nil
1023	case *query.Not:
1024		ct, err := d.newMatchTree(s.Child, opt)
1025		return &notMatchTree{
1026			child: ct,
1027		}, err
1028
1029	case *query.Type:
1030		if s.Type != query.TypeFileName {
1031			break
1032		}
1033
1034		ct, err := d.newMatchTree(s.Child, opt)
1035		if err != nil {
1036			return nil, err
1037		}
1038
1039		return &fileNameMatchTree{
1040			child: ct,
1041		}, nil
1042
1043	case *query.Boost:
1044		ct, err := d.newMatchTree(s.Child, opt)
1045		if err != nil {
1046			return nil, err
1047		}
1048
1049		return &boostMatchTree{
1050			child: ct,
1051			boost: s.Boost,
1052		}, nil
1053
1054	case *query.Substring:
1055		return d.newSubstringMatchTree(s)
1056
1057	case *query.Branch:
1058		masks := make([]uint64, 0, len(d.repoMetaData))
1059		if s.Pattern == "HEAD" {
1060			for i := 0; i < len(d.repoMetaData); i++ {
1061				masks = append(masks, 1)
1062			}
1063		} else {
1064			for _, branchIDs := range d.branchIDs {
1065				mask := uint64(0)
1066				for nm, m := range branchIDs {
1067					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1068						mask |= uint64(m)
1069					}
1070				}
1071				masks = append(masks, mask)
1072			}
1073		}
1074		return &branchQueryMatchTree{
1075			masks:     masks,
1076			fileMasks: d.fileBranchMasks,
1077			repos:     d.repos,
1078		}, nil
1079	case *query.Const:
1080		if s.Value {
1081			return &bruteForceMatchTree{}, nil
1082		} else {
1083			return &noMatchTree{Why: "const"}, nil
1084		}
1085	case *query.Language:
1086		code, ok := d.metaData.LanguageMap[s.Language]
1087		if !ok {
1088			return &noMatchTree{Why: "lang"}, nil
1089		}
1090		return &docMatchTree{
1091			reason:  "language",
1092			numDocs: d.numDocs(),
1093			predicate: func(docID uint32) bool {
1094				return d.getLanguage(docID) == code
1095			},
1096		}, nil
1097
1098	case *query.Symbol:
1099		// Disable WordMatchTree since we don't support it in symbols yet.
1100		optCopy := opt
1101		optCopy.DisableWordMatchOptimization = true
1102
1103		subMT, err := d.newMatchTree(s.Expr, optCopy)
1104		if err != nil {
1105			return nil, err
1106		}
1107
1108		if substr, ok := subMT.(*substrMatchTree); ok {
1109			return &symbolSubstrMatchTree{
1110				substrMatchTree: substr,
1111				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1112				fileEndRunes:    d.fileEndRunes,
1113				fileEndSymbol:   d.fileEndSymbol,
1114				sections:        d.runeDocSections,
1115			}, nil
1116		}
1117
1118		var regexpMT *regexpMatchTree
1119		visitMatchTree(subMT, func(mt matchTree) {
1120			if t, ok := mt.(*regexpMatchTree); ok {
1121				regexpMT = t
1122			}
1123		})
1124		if regexpMT == nil {
1125			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1126		}
1127
1128		return &symbolRegexpMatchTree{
1129			regexp:    regexpMT.regexp,
1130			all:       isRegexpAll(regexpMT.origRegexp),
1131			matchTree: subMT,
1132		}, nil
1133
1134	case *query.FileNameSet:
1135		return &docMatchTree{
1136			reason:  "FileNameSet",
1137			numDocs: d.numDocs(),
1138			predicate: func(docID uint32) bool {
1139				fileName := d.fileName(docID)
1140				_, ok := s.Set[string(fileName)]
1141				return ok
1142			},
1143		}, nil
1144
1145	case *query.BranchesRepos:
1146		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1147		for repoIdx := range d.repoMetaData {
1148			var mask uint64
1149			for _, br := range s.List {
1150				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1151					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1152				}
1153			}
1154			reposBranchesWant[repoIdx] = mask
1155		}
1156		return &docMatchTree{
1157			reason:  "BranchesRepos",
1158			numDocs: d.numDocs(),
1159			predicate: func(docID uint32) bool {
1160				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1161			},
1162		}, nil
1163
1164	case *query.RepoSet:
1165		reposWant := make([]bool, len(d.repoMetaData))
1166		for repoIdx, r := range d.repoMetaData {
1167			if _, ok := s.Set[r.Name]; ok {
1168				reposWant[repoIdx] = true
1169			}
1170		}
1171		return &docMatchTree{
1172			reason:  "RepoSet",
1173			numDocs: d.numDocs(),
1174			predicate: func(docID uint32) bool {
1175				return reposWant[d.repos[docID]]
1176			},
1177		}, nil
1178
1179	case *query.RepoIDs:
1180		reposWant := make([]bool, len(d.repoMetaData))
1181		for repoIdx, r := range d.repoMetaData {
1182			if s.Repos.Contains(r.ID) {
1183				reposWant[repoIdx] = true
1184			}
1185		}
1186		return &docMatchTree{
1187			reason:  "RepoIDs",
1188			numDocs: d.numDocs(),
1189			predicate: func(docID uint32) bool {
1190				return reposWant[d.repos[docID]]
1191			},
1192		}, nil
1193
1194	case *query.Repo:
1195		reposWant := make([]bool, len(d.repoMetaData))
1196		for repoIdx, r := range d.repoMetaData {
1197			if s.Regexp.MatchString(r.Name) {
1198				reposWant[repoIdx] = true
1199			}
1200		}
1201		return &docMatchTree{
1202			reason:  "Repo",
1203			numDocs: d.numDocs(),
1204			predicate: func(docID uint32) bool {
1205				return reposWant[d.repos[docID]]
1206			},
1207		}, nil
1208
1209	case *query.RepoRegexp:
1210		reposWant := make([]bool, len(d.repoMetaData))
1211		for repoIdx, r := range d.repoMetaData {
1212			if s.Regexp.MatchString(r.Name) {
1213				reposWant[repoIdx] = true
1214			}
1215		}
1216		return &docMatchTree{
1217			reason:  "RepoRegexp",
1218			numDocs: d.numDocs(),
1219			predicate: func(docID uint32) bool {
1220				return reposWant[d.repos[docID]]
1221			},
1222		}, nil
1223
1224	case query.RawConfig:
1225		return &docMatchTree{
1226			reason:  s.String(),
1227			numDocs: d.numDocs(),
1228			predicate: func(docID uint32) bool {
1229				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1230			},
1231		}, nil
1232	}
1233	log.Panicf("type %T", q)
1234	return nil, nil
1235}
1236
1237func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1238	st := &substrMatchTree{
1239		query:         s,
1240		caseSensitive: s.CaseSensitive,
1241		fileName:      s.FileName,
1242	}
1243
1244	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1245		return newRegexpMatchTree(&query.Regexp{
1246			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1247			FileName:      s.FileName,
1248			Content:       s.Content,
1249			CaseSensitive: s.CaseSensitive,
1250		}), nil
1251	}
1252
1253	result, err := d.iterateNgrams(s)
1254	if err != nil {
1255		return nil, err
1256	}
1257	st.matchIterator = result
1258	return st, nil
1259}
1260
1261func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1262	if opt.DisableWordMatchOptimization {
1263		return nil, false
1264	}
1265	// Needs to be case sensitive
1266	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1267		return nil, false
1268	}
1269	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1270	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1271		return nil, false
1272	}
1273	sub := q.Regexp.Sub
1274	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1275		return nil, false
1276	}
1277
1278	return &wordMatchTree{
1279		word:     string(sub[1].Rune),
1280		fileName: q.FileName,
1281	}, true
1282}
1283
1284// pruneMatchTree removes impossible branches from the matchTree, as indicated
1285// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1286func pruneMatchTree(mt matchTree) (matchTree, error) {
1287	var err error
1288	switch mt := mt.(type) {
1289	// leaf nodes that we test with the filter:
1290	case *substrMatchTree:
1291		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1292			if _, ok := res.matchIterator.(*noMatchTree); ok {
1293				return nil, nil
1294			}
1295		}
1296	// recursive tree structures:
1297	case *andMatchTree:
1298		// Any branch of an and becoming impossible means the entire clause
1299		// is impossible. Otherwise, just handle rewrites.
1300		for i, child := range mt.children {
1301			newChild, err := pruneMatchTree(child)
1302			if err != nil {
1303				return nil, err
1304			}
1305			if newChild == nil {
1306				return nil, nil
1307			}
1308			mt.children[i] = newChild
1309		}
1310	case *orMatchTree:
1311		// *All* branches of an OR becoming impossible means the entire clause
1312		// is impossible. Otherwise, drop impossible subclauses and handle
1313		// rewrites, including simplifying to a singular resulting child branch.
1314		n := 0
1315		for _, child := range mt.children {
1316			newChild, err := pruneMatchTree(child)
1317			if err != nil {
1318				return nil, err
1319			}
1320			if newChild != nil {
1321				mt.children[n] = newChild
1322				n++
1323			}
1324		}
1325		mt.children = mt.children[:n]
1326		if len(mt.children) == 1 {
1327			return mt.children[0], nil
1328		} else if len(mt.children) == 0 {
1329			return nil, nil
1330		}
1331	case *noVisitMatchTree:
1332		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1333		if err != nil {
1334			return nil, err
1335		}
1336		if mt.matchTree == nil {
1337			return nil, nil
1338		}
1339	case *fileNameMatchTree:
1340		mt.child, err = pruneMatchTree(mt.child)
1341		if err != nil {
1342			return nil, err
1343		}
1344		if mt.child == nil {
1345			return nil, nil
1346		}
1347	case *boostMatchTree:
1348		mt.child, err = pruneMatchTree(mt.child)
1349		if err != nil {
1350			return nil, err
1351		}
1352		if mt.child == nil {
1353			return nil, nil
1354		}
1355	case *andLineMatchTree:
1356		child, err := pruneMatchTree(&mt.andMatchTree)
1357		if err != nil {
1358			return nil, err
1359		}
1360		if child == nil {
1361			return nil, nil
1362		}
1363		if c, ok := child.(*andMatchTree); ok {
1364			mt.andMatchTree = *c
1365		} else {
1366			// the and was simplified to a single clause,
1367			// so the linematch portion is irrelevant.
1368			return mt, nil
1369		}
1370	case *notMatchTree:
1371		mt.child, err = pruneMatchTree(mt.child)
1372		if err != nil {
1373			return nil, err
1374		}
1375		if mt.child == nil {
1376			// not false => true
1377			return &bruteForceMatchTree{}, nil
1378		}
1379	// unhandled:
1380	case *docMatchTree:
1381	case *bruteForceMatchTree:
1382	case *regexpMatchTree:
1383	case *wordMatchTree:
1384	}
1385	return mt, err
1386}
1387
1388// isRegexpAll returns true if the query matches all possible lines.
1389//
1390// Note: it is possible for a funky regex to actually match all but this
1391// returns false. This returns true for normal looking regexes like ".*" or
1392// "(?-s:.*)".
1393func isRegexpAll(r *syntax.Regexp) bool {
1394	// Note: we don't care about any flags since we are looking for .* and we
1395	// don't mind if . matches all or everything but newline.
1396
1397	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1398	for {
1399		// Our main target: .*
1400		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1401			// (?s:.) or (?-s:.)
1402			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1403		}
1404
1405		// Strip away expressions being wrapped in paranthesis
1406		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1407			r = r.Sub[0]
1408			continue
1409		}
1410
1411		return false
1412	}
1413}
Configure Feed

Configure Feed