index/matchtree.go at a4188d22d917b834cca2da69bffc41a6d19302ac · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / index / matchtree.go
at a4188d22d917b834cca2da69bffc41a6d19302ac 35 kB View raw
John Mason Add support for indexing and searching custom fields for repositories (#962) 1y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package index
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"math"
  22	"regexp/syntax"
  23	"strings"
  24	"unicode/utf8"
  25
  26	"github.com/grafana/regexp"
  27
  28	"github.com/sourcegraph/zoekt"
  29	"github.com/sourcegraph/zoekt/internal/syntaxutil"
  30	"github.com/sourcegraph/zoekt/query"
  31)
  32
  33// A docIterator iterates over documents in order.
  34type docIterator interface {
  35	// provide the next document where we may find something interesting.
  36	//
  37	// This is like a "peek" and shouldn't mutate state. prepare is what should
  38	// change state.
  39	nextDoc() uint32
  40
  41	// clears any per-document state of the docIterator, and
  42	// prepares for evaluating the given doc. The argument is
  43	// strictly increasing over time.
  44	prepare(nextDoc uint32)
  45}
  46
  47// costs are passed in increasing order to matchTree.matches until they do not
  48// return matchesRequiresHigherCost.
  49const (
  50	costConst   = 0
  51	costMemory  = 1
  52	costContent = 2
  53	costRegexp  = 3
  54)
  55
  56const (
  57	costMin = costConst
  58	costMax = costRegexp
  59)
  60
  61// matchesState is an enum for the state of a matchTree after a call to
  62// matchTree.matches.
  63type matchesState uint8
  64
  65const (
  66	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  67	// a search yet since the cost value is not high enough.
  68	matchesRequiresHigherCost matchesState = iota
  69
  70	// matchesFound is returned when matchTree.matches has done a search and
  71	// found one or more matches.
  72	matchesFound
  73
  74	// matchesNone is returned when matchTree.matches has done a search and
  75	// found nothing.
  76	matchesNone
  77)
  78
  79// matchesStatePred is a helper which returns matchesFound if b is true
  80// otherwise returns matchesNone.
  81func matchesStatePred(b bool) matchesState {
  82	if b {
  83		return matchesFound
  84	}
  85	return matchesNone
  86}
  87
  88// matchesStateForSlice is a helper which returns matchesFound if v is
  89// non-empty otherwise returns matchesNone.
  90func matchesStateForSlice[T any](v []T) matchesState {
  91	return matchesStatePred(len(v) > 0)
  92}
  93
  94// An expression tree coupled with matches. The matchtree has two
  95// functions:
  96//
  97// * it implements boolean combinations (and, or, not)
  98//
  99// * it implements shortcuts, where we skip documents (for example: if
 100// there are no trigram matches, we can be sure there are no substring
 101// matches). The matchtree iterates over the documents as they are
 102// ordered in the shard.
 103//
 104// The general process for a given (shard, query) is:
 105//
 106// - construct matchTree for the query
 107//
 108// - find all different leaf matchTrees (substring, regexp, etc.)
 109//
 110// in a loop:
 111//
 112//   - find next doc to process using nextDoc
 113//
 114//   - evaluate atoms (leaf expressions that match text)
 115//
 116//   - evaluate the tree using matches(), storing the result in map.
 117//
 118//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 119//     for the document, collect all text matches by looking at leaf
 120//     matchTrees.
 121type matchTree interface {
 122	docIterator
 123
 124	// matches if cost is high enough. See documentation for matchesState's
 125	// values.
 126	//
 127	// Note: Do not call this directly, rather use evalMatchTree which uses
 128	// known to cache responses once the state transitions away from
 129	// matchesRequiresHigherCost.
 130	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 131}
 132
 133// docMatchTree iterates over documents for which predicate(docID) returns true.
 134type docMatchTree struct {
 135	// the number of documents in a shard.
 136	numDocs uint32
 137
 138	predicate func(docID uint32) bool
 139
 140	// provides additional information about the reason why the docMatchTree was
 141	// created.
 142	reason string
 143
 144	// mutable
 145	firstDone bool
 146	docID     uint32
 147}
 148
 149type bruteForceMatchTree struct {
 150	// mutable
 151	firstDone bool
 152	docID     uint32
 153}
 154
 155type andLineMatchTree struct {
 156	andMatchTree
 157}
 158
 159type andMatchTree struct {
 160	children []matchTree
 161}
 162
 163type orMatchTree struct {
 164	children []matchTree
 165}
 166
 167type notMatchTree struct {
 168	child matchTree
 169}
 170
 171// Returns only the filename of child matches.
 172type fileNameMatchTree struct {
 173	child matchTree
 174}
 175
 176type boostMatchTree struct {
 177	child matchTree
 178	boost float64
 179}
 180
 181// Don't visit this subtree for collecting matches.
 182type noVisitMatchTree struct {
 183	matchTree
 184}
 185
 186type regexpMatchTree struct {
 187	regexp *regexp.Regexp
 188
 189	// origRegexp is the original parsed regexp from the query structure. It
 190	// does not include mutations such as case sensitivity.
 191	origRegexp *syntax.Regexp
 192
 193	fileName bool
 194
 195	// mutable
 196	reEvaluated bool
 197	found       []*candidateMatch
 198
 199	// nextDoc, prepare.
 200	bruteForceMatchTree
 201}
 202
 203func newRegexpMatchTree(s *query.Regexp) *regexpMatchTree {
 204	prefix := ""
 205	if !s.CaseSensitive {
 206		prefix = "(?i)"
 207	}
 208
 209	return &regexpMatchTree{
 210		regexp:     regexp.MustCompile(prefix + syntaxutil.RegexpString(s.Regexp)),
 211		origRegexp: s.Regexp,
 212		fileName:   s.FileName,
 213	}
 214}
 215
 216// \bLITERAL\b
 217type wordMatchTree struct {
 218	word string
 219
 220	fileName bool
 221
 222	// mutable
 223	evaluated bool
 224	found     []*candidateMatch
 225
 226	// nextDoc, prepare.
 227	bruteForceMatchTree
 228}
 229
 230type substrMatchTree struct {
 231	matchIterator
 232
 233	query         *query.Substring
 234	caseSensitive bool
 235	fileName      bool
 236
 237	// mutable
 238	current       []*candidateMatch
 239	contEvaluated bool
 240}
 241
 242type branchQueryMatchTree struct {
 243	fileMasks []uint64
 244	masks     []uint64
 245	repos     []uint16
 246
 247	// mutable
 248	firstDone bool
 249	docID     uint32
 250}
 251
 252func (t *branchQueryMatchTree) branchMask() uint64 {
 253	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 254}
 255
 256type symbolRegexpMatchTree struct {
 257	matchTree
 258	regexp *regexp.Regexp
 259	all    bool // skips regex match if .*
 260
 261	reEvaluated bool
 262	found       []*candidateMatch
 263}
 264
 265func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 266	t.reEvaluated = false
 267	t.found = t.found[:0]
 268	t.matchTree.prepare(doc)
 269}
 270
 271func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 272	if t.reEvaluated {
 273		return matchesStateForSlice(t.found)
 274	}
 275
 276	if cost < costRegexp {
 277		return matchesRequiresHigherCost
 278	}
 279
 280	sections := cp.docSections()
 281	content := cp.data(false)
 282
 283	found := t.found[:0]
 284	for i, sec := range sections {
 285		var idx []int
 286		if t.all {
 287			idx = []int{0, int(sec.End - sec.Start)}
 288		} else {
 289			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 290			if idx == nil {
 291				continue
 292			}
 293		}
 294
 295		cm := &candidateMatch{
 296			byteOffset:  sec.Start + uint32(idx[0]),
 297			byteMatchSz: uint32(idx[1] - idx[0]),
 298			symbol:      true,
 299			symbolIdx:   uint32(i),
 300		}
 301		found = append(found, cm)
 302	}
 303	t.found = found
 304	t.reEvaluated = true
 305
 306	return matchesStateForSlice(t.found)
 307}
 308
 309type symbolSubstrMatchTree struct {
 310	*substrMatchTree
 311
 312	patternSize   uint32
 313	fileEndRunes  []uint32
 314	fileEndSymbol []uint32
 315
 316	doc      uint32
 317	sections []DocumentSection
 318
 319	secID uint32
 320}
 321
 322func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 323	t.substrMatchTree.prepare(doc)
 324	t.doc = doc
 325
 326	var fileStart uint32
 327	if doc > 0 {
 328		fileStart = t.fileEndRunes[doc-1]
 329	}
 330
 331	var sections []DocumentSection
 332	if len(t.sections) > 0 {
 333		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 334		if most == uint32(len(t.sections)) {
 335			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 336		} else {
 337			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 338				t.secID++
 339			}
 340
 341			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 342			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 343				symbolEnd++
 344			}
 345
 346			sections = t.sections[t.secID:symbolEnd]
 347		}
 348	}
 349
 350	secIdx := 0
 351	trimmed := t.current[:0]
 352	for len(sections) > secIdx && len(t.current) > 0 {
 353		start := fileStart + t.current[0].runeOffset
 354		end := start + t.patternSize
 355		if start >= sections[secIdx].End {
 356			secIdx++
 357			continue
 358		}
 359
 360		if start < sections[secIdx].Start {
 361			t.current = t.current[1:]
 362			continue
 363		}
 364
 365		if end <= sections[secIdx].End {
 366			t.current[0].symbol = true
 367			t.current[0].symbolIdx = uint32(secIdx)
 368			trimmed = append(trimmed, t.current[0])
 369		}
 370
 371		t.current = t.current[1:]
 372	}
 373	t.current = trimmed
 374}
 375
 376// all prepare methods
 377
 378func (t *bruteForceMatchTree) prepare(doc uint32) {
 379	t.docID = doc
 380	t.firstDone = true
 381}
 382
 383func (t *docMatchTree) prepare(doc uint32) {
 384	t.docID = doc
 385	t.firstDone = true
 386}
 387
 388func (t *andMatchTree) prepare(doc uint32) {
 389	for _, c := range t.children {
 390		c.prepare(doc)
 391	}
 392}
 393
 394func (t *regexpMatchTree) prepare(doc uint32) {
 395	t.found = t.found[:0]
 396	t.reEvaluated = false
 397	t.bruteForceMatchTree.prepare(doc)
 398}
 399
 400func (t *wordMatchTree) prepare(doc uint32) {
 401	t.found = t.found[:0]
 402	t.evaluated = false
 403	t.bruteForceMatchTree.prepare(doc)
 404}
 405
 406func (t *orMatchTree) prepare(doc uint32) {
 407	for _, c := range t.children {
 408		c.prepare(doc)
 409	}
 410}
 411
 412func (t *notMatchTree) prepare(doc uint32) {
 413	t.child.prepare(doc)
 414}
 415
 416func (t *fileNameMatchTree) prepare(doc uint32) {
 417	t.child.prepare(doc)
 418}
 419
 420func (t *boostMatchTree) prepare(doc uint32) {
 421	t.child.prepare(doc)
 422}
 423
 424func (t *substrMatchTree) prepare(nextDoc uint32) {
 425	t.matchIterator.prepare(nextDoc)
 426	t.current = t.matchIterator.candidates()
 427	t.contEvaluated = false
 428}
 429
 430func (t *branchQueryMatchTree) prepare(doc uint32) {
 431	t.firstDone = true
 432	t.docID = doc
 433}
 434
 435// nextDoc
 436
 437func (t *docMatchTree) nextDoc() uint32 {
 438	var start uint32
 439	if t.firstDone {
 440		start = t.docID + 1
 441	}
 442	for i := start; i < t.numDocs; i++ {
 443		if t.predicate(i) {
 444			return i
 445		}
 446	}
 447	return math.MaxUint32
 448}
 449
 450func (t *bruteForceMatchTree) nextDoc() uint32 {
 451	if !t.firstDone {
 452		return 0
 453	}
 454	return t.docID + 1
 455}
 456
 457func (t *andMatchTree) nextDoc() uint32 {
 458	var max uint32
 459	for _, c := range t.children {
 460		m := c.nextDoc()
 461		if m > max {
 462			max = m
 463		}
 464	}
 465	return max
 466}
 467
 468func (t *orMatchTree) nextDoc() uint32 {
 469	min := uint32(math.MaxUint32)
 470	for _, c := range t.children {
 471		m := c.nextDoc()
 472		if m < min {
 473			min = m
 474		}
 475	}
 476	return min
 477}
 478
 479func (t *notMatchTree) nextDoc() uint32 {
 480	return 0
 481}
 482
 483func (t *fileNameMatchTree) nextDoc() uint32 {
 484	return t.child.nextDoc()
 485}
 486
 487func (t *boostMatchTree) nextDoc() uint32 {
 488	return t.child.nextDoc()
 489}
 490
 491func (t *branchQueryMatchTree) nextDoc() uint32 {
 492	var start uint32
 493	if t.firstDone {
 494		start = t.docID + 1
 495	}
 496
 497	for i := start; i < uint32(len(t.fileMasks)); i++ {
 498		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 499			return i
 500		}
 501	}
 502	return math.MaxUint32
 503}
 504
 505// all String methods
 506
 507func (t *bruteForceMatchTree) String() string {
 508	return "all"
 509}
 510
 511func (t *docMatchTree) String() string {
 512	return fmt.Sprintf("doc(%s)", t.reason)
 513}
 514
 515func (t *andMatchTree) String() string {
 516	return fmt.Sprintf("and%v", t.children)
 517}
 518
 519func (t *regexpMatchTree) String() string {
 520	f := ""
 521	if t.fileName {
 522		f = "f"
 523	}
 524	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 525}
 526
 527func (t *wordMatchTree) String() string {
 528	f := ""
 529	if t.fileName {
 530		f = "f"
 531	}
 532	return fmt.Sprintf("%sword(%s)", f, t.word)
 533}
 534
 535func (t *orMatchTree) String() string {
 536	return fmt.Sprintf("or%v", t.children)
 537}
 538
 539func (t *notMatchTree) String() string {
 540	return fmt.Sprintf("not(%v)", t.child)
 541}
 542
 543func (t *noVisitMatchTree) String() string {
 544	return fmt.Sprintf("novisit(%v)", t.matchTree)
 545}
 546
 547func (t *fileNameMatchTree) String() string {
 548	return fmt.Sprintf("f(%v)", t.child)
 549}
 550
 551func (t *boostMatchTree) String() string {
 552	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 553}
 554
 555func (t *substrMatchTree) String() string {
 556	f := ""
 557	if t.fileName {
 558		f = "f"
 559	}
 560
 561	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 562}
 563
 564func (t *branchQueryMatchTree) String() string {
 565	return fmt.Sprintf("branch(%x)", t.masks)
 566}
 567
 568func (t *symbolSubstrMatchTree) String() string {
 569	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 570}
 571
 572func (t *symbolRegexpMatchTree) String() string {
 573	return fmt.Sprintf("symbol(%v)", t.matchTree)
 574}
 575
 576// visitMatches visits all atoms in matchTree. Note: This visits
 577// noVisitMatchTree. For collecting matches use visitMatches.
 578func visitMatchTree(t matchTree, f func(matchTree)) {
 579	switch s := t.(type) {
 580	case *andMatchTree:
 581		for _, ch := range s.children {
 582			visitMatchTree(ch, f)
 583		}
 584	case *orMatchTree:
 585		for _, ch := range s.children {
 586			visitMatchTree(ch, f)
 587		}
 588	case *andLineMatchTree:
 589		visitMatchTree(&s.andMatchTree, f)
 590	case *noVisitMatchTree:
 591		visitMatchTree(s.matchTree, f)
 592	case *notMatchTree:
 593		visitMatchTree(s.child, f)
 594	case *fileNameMatchTree:
 595		visitMatchTree(s.child, f)
 596	case *boostMatchTree:
 597		visitMatchTree(s.child, f)
 598	case *symbolSubstrMatchTree:
 599		visitMatchTree(s.substrMatchTree, f)
 600	case *symbolRegexpMatchTree:
 601		visitMatchTree(s.matchTree, f)
 602	default:
 603		f(t)
 604	}
 605}
 606
 607// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 608// function defined.
 609func updateMatchTreeStats(mt matchTree, stats *zoekt.Stats) {
 610	visitMatchTree(mt, func(mt matchTree) {
 611		if atom, ok := mt.(interface{ updateStats(*zoekt.Stats) }); ok {
 612			atom.updateStats(stats)
 613		}
 614	})
 615}
 616
 617func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 618	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 619		f(mt)
 620	})
 621}
 622
 623// visitMatches visits all atoms which can contribute matches. Note: This
 624// skips noVisitMatchTree.
 625func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 626	switch s := t.(type) {
 627	case *andMatchTree:
 628		for _, ch := range s.children {
 629			if known[ch] {
 630				visitMatches(ch, known, weight, f)
 631			}
 632		}
 633	case *andLineMatchTree:
 634		visitMatches(&s.andMatchTree, known, weight, f)
 635	case *orMatchTree:
 636		for _, ch := range s.children {
 637			if known[ch] {
 638				visitMatches(ch, known, weight, f)
 639			}
 640		}
 641	case *boostMatchTree:
 642		visitMatches(s.child, known, weight*s.boost, f)
 643	case *symbolSubstrMatchTree:
 644		visitMatches(s.substrMatchTree, known, weight, f)
 645	case *notMatchTree:
 646	case *noVisitMatchTree:
 647		// don't collect into negative trees.
 648	case *fileNameMatchTree:
 649		// We will just gather the filename if we do not visit this tree.
 650	default:
 651		f(s, weight)
 652	}
 653}
 654
 655// all matches() methods.
 656
 657func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 658	return matchesStatePred(t.predicate(cp.idx))
 659}
 660
 661func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 662	return matchesFound
 663}
 664
 665// andLineMatchTree is a performance optimization of andMatchTree. For content
 666// searches we don't want to run the regex engine if there is no line that
 667// contains matches from all terms.
 668func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 669	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 670		return state
 671	}
 672
 673	// Invariant: all children have matches. If any line contains all of them we
 674	// can return MatchesFound.
 675
 676	// find child with fewest candidates
 677	minCount := math.MaxInt
 678	fewestChildren := 0
 679	for ix, child := range t.children {
 680		v, ok := child.(*substrMatchTree)
 681		// make sure we are running a content search and that all candidates are a
 682		// substrMatchTree
 683		if !ok || v.fileName {
 684			return matchesFound
 685		}
 686		if len(v.current) < minCount {
 687			minCount = len(v.current)
 688			fewestChildren = ix
 689		}
 690	}
 691
 692	type lineRange struct {
 693		start int
 694		end   int
 695	}
 696	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 697	prev := -1
 698	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 699		line := cp.newlines().atOffset(candidate.byteOffset)
 700		if line == prev {
 701			continue
 702		}
 703		prev = line
 704		byteStart := int(cp.newlines().lineStart(line))
 705		byteEnd := int(cp.newlines().lineStart(line + 1))
 706		lines = append(lines, lineRange{byteStart, byteEnd})
 707	}
 708
 709	// children keeps track of the children's candidates we have already seen.
 710	children := make([][]*candidateMatch, 0, len(t.children)-1)
 711	for j, child := range t.children {
 712		if j == fewestChildren {
 713			continue
 714		}
 715		children = append(children, child.(*substrMatchTree).current)
 716	}
 717
 718nextLine:
 719	for i := 0; i < len(lines); i++ {
 720		hits := 1
 721	nextChild:
 722		for j := range children {
 723		nextCandidate:
 724			for len(children[j]) > 0 {
 725				candidate := children[j][0]
 726				bo := int(cp.findOffset(false, candidate.runeOffset))
 727				if bo < lines[i].start {
 728					children[j] = children[j][1:]
 729					continue nextCandidate
 730				}
 731				if bo < lines[i].end {
 732					hits++
 733					continue nextChild
 734				}
 735				// move the `lines` iterator forward until bo < line.end
 736				for i < len(lines) && bo >= lines[i].end {
 737					i++
 738				}
 739				i--
 740				continue nextLine
 741			}
 742		}
 743		// return early once we found any line that contains matches from all children
 744		if hits == len(t.children) {
 745			return matchesFound
 746		}
 747	}
 748	return matchesNone
 749}
 750
 751func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 752	// We have found matches unless a child needs to do more work or it hasn't
 753	// found matches.
 754	state := matchesFound
 755
 756	for _, ch := range t.children {
 757		switch evalMatchTree(cp, cost, known, ch) {
 758		case matchesRequiresHigherCost:
 759			// keep evaluating other children incase we come across matchesNone
 760			state = matchesRequiresHigherCost
 761		case matchesFound:
 762			// will return this if every child has this value
 763		case matchesNone:
 764			return matchesNone
 765		}
 766	}
 767
 768	return state
 769}
 770
 771func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 772	// we could short-circuit, but we want to use the other possibilities as a
 773	// ranking signal. So we always return the most conservative state.
 774	state := matchesNone
 775	for _, ch := range t.children {
 776		switch evalMatchTree(cp, cost, known, ch) {
 777		case matchesRequiresHigherCost:
 778			state = matchesRequiresHigherCost
 779		case matchesFound:
 780			if state != matchesRequiresHigherCost {
 781				state = matchesFound
 782			}
 783		case matchesNone:
 784			// noop
 785		}
 786	}
 787	return state
 788}
 789
 790func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 791	return matchesStatePred(t.branchMask() != 0)
 792}
 793
 794func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 795	if t.reEvaluated {
 796		return matchesStateForSlice(t.found)
 797	}
 798
 799	if cost < costRegexp {
 800		return matchesRequiresHigherCost
 801	}
 802
 803	cp.stats.RegexpsConsidered++
 804	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 805	found := t.found[:0]
 806	for _, idx := range idxs {
 807		cm := &candidateMatch{
 808			byteOffset:  uint32(idx[0]),
 809			byteMatchSz: uint32(idx[1] - idx[0]),
 810			fileName:    t.fileName,
 811		}
 812
 813		found = append(found, cm)
 814	}
 815	t.found = found
 816	t.reEvaluated = true
 817
 818	return matchesStateForSlice(t.found)
 819}
 820
 821func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 822	if t.evaluated {
 823		return matchesStateForSlice(t.found)
 824	}
 825
 826	if cost < costRegexp {
 827		return matchesRequiresHigherCost
 828	}
 829
 830	data := cp.data(t.fileName)
 831	offset := 0
 832	found := t.found[:0]
 833	for {
 834		idx := bytes.Index(data[offset:], []byte(t.word))
 835		if idx < 0 {
 836			break
 837		}
 838
 839		relStartOffset := offset + idx
 840		relEndOffset := relStartOffset + len(t.word)
 841
 842		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 843		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 844		if startBoundary && endBoundary {
 845			found = append(found, &candidateMatch{
 846				byteOffset:  uint32(offset + idx),
 847				byteMatchSz: uint32(len(t.word)),
 848				fileName:    t.fileName,
 849			})
 850		}
 851		offset += idx + len(t.word)
 852	}
 853
 854	t.found = found
 855	t.evaluated = true
 856
 857	return matchesStateForSlice(t.found)
 858}
 859
 860// breakMatchesOnNewlines returns matches resulting from breaking each element
 861// of cms on newlines within text.
 862func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 863	var lineCMs []*candidateMatch
 864	for _, cm := range cms {
 865		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 866	}
 867	return lineCMs
 868}
 869
 870// breakOnNewlines returns matches resulting from breaking cm on newlines
 871// within text.
 872func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 873	var cms []*candidateMatch
 874	addMe := &candidateMatch{}
 875	*addMe = *cm
 876	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 877		if text[i] == '\n' {
 878			addMe.byteMatchSz = i - addMe.byteOffset
 879			if addMe.byteMatchSz != 0 {
 880				cms = append(cms, addMe)
 881			}
 882
 883			addMe = &candidateMatch{}
 884			*addMe = *cm
 885			addMe.byteOffset = i + 1
 886		}
 887	}
 888	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 889	if addMe.byteMatchSz != 0 {
 890		cms = append(cms, addMe)
 891	}
 892	return cms
 893}
 894
 895// evalMatchTree should be called instead of directly calling
 896// matchTree.matches. It cache known values for future evaluation at higher
 897// costs.
 898func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 899	if v, ok := known[mt]; ok {
 900		return matchesStatePred(v)
 901	}
 902
 903	ms := mt.matches(cp, cost, known)
 904	if ms != matchesRequiresHigherCost {
 905		known[mt] = ms == matchesFound
 906	}
 907
 908	return ms
 909}
 910
 911func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 912	switch evalMatchTree(cp, cost, known, t.child) {
 913	case matchesRequiresHigherCost:
 914		return matchesRequiresHigherCost
 915	case matchesFound:
 916		return matchesNone
 917	case matchesNone:
 918		return matchesFound
 919	default:
 920		panic("unreachable")
 921	}
 922}
 923
 924func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 925	return evalMatchTree(cp, cost, known, t.child)
 926}
 927
 928func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 929	return evalMatchTree(cp, cost, known, t.child)
 930}
 931
 932func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 933	if t.contEvaluated {
 934		return matchesStateForSlice(t.current)
 935	}
 936
 937	if len(t.current) == 0 {
 938		return matchesNone
 939	}
 940
 941	if t.fileName && cost < costMemory {
 942		return matchesRequiresHigherCost
 943	}
 944
 945	if !t.fileName && cost < costContent {
 946		return matchesRequiresHigherCost
 947	}
 948
 949	pruned := t.current[:0]
 950	for _, m := range t.current {
 951		if m.byteOffset == 0 && m.runeOffset > 0 {
 952			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 953		}
 954		if m.matchContent(cp.data(m.fileName)) {
 955			pruned = append(pruned, m)
 956		}
 957	}
 958	t.current = pruned
 959	t.contEvaluated = true
 960
 961	return matchesStateForSlice(t.current)
 962}
 963
 964type matchTreeOpt struct {
 965	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 966	// This was added since we do not support wordMatchTree with symbol search.
 967	DisableWordMatchOptimization bool
 968}
 969
 970func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 971	if q == nil {
 972		return nil, fmt.Errorf("got nil (sub)query")
 973	}
 974	switch s := q.(type) {
 975	case *query.Regexp:
 976		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 977		// superset of the regexp. If the returned matchTree is equivalent to the
 978		// original regexp, it returns true. An equivalent matchTree has the same
 979		// behaviour as the original regexp and can be used instead.
 980		//
 981		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 982		if err != nil {
 983			return nil, err
 984		}
 985		// if the query can be used in place of the regexp
 986		// return the subtree
 987		if isEq {
 988			return subMT, nil
 989		}
 990
 991		var tr matchTree
 992		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 993			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 994			// provide something faster.
 995			tr = wmt
 996		} else {
 997			tr = newRegexpMatchTree(s)
 998		}
 999
1000		return &andMatchTree{
1001			children: []matchTree{
1002				tr, &noVisitMatchTree{subMT},
1003			},
1004		}, nil
1005	case *query.And:
1006		var r []matchTree
1007		for _, ch := range s.Children {
1008			ct, err := d.newMatchTree(ch, opt)
1009			if err != nil {
1010				return nil, err
1011			}
1012			r = append(r, ct)
1013		}
1014		return &andMatchTree{r}, nil
1015	case *query.Or:
1016		var r []matchTree
1017		for _, ch := range s.Children {
1018			ct, err := d.newMatchTree(ch, opt)
1019			if err != nil {
1020				return nil, err
1021			}
1022			r = append(r, ct)
1023		}
1024		return &orMatchTree{r}, nil
1025	case *query.Not:
1026		ct, err := d.newMatchTree(s.Child, opt)
1027		return &notMatchTree{
1028			child: ct,
1029		}, err
1030
1031	case *query.Type:
1032		if s.Type != query.TypeFileName {
1033			break
1034		}
1035
1036		ct, err := d.newMatchTree(s.Child, opt)
1037		if err != nil {
1038			return nil, err
1039		}
1040
1041		return &fileNameMatchTree{
1042			child: ct,
1043		}, nil
1044
1045	case *query.Boost:
1046		ct, err := d.newMatchTree(s.Child, opt)
1047		if err != nil {
1048			return nil, err
1049		}
1050
1051		return &boostMatchTree{
1052			child: ct,
1053			boost: s.Boost,
1054		}, nil
1055
1056	case *query.Meta:
1057		reposWant := make([]bool, len(d.repoMetaData))
1058		for repoIdx, r := range d.repoMetaData {
1059			if r.Metadata != nil {
1060				if val, ok := r.Metadata[s.Field]; ok && s.Value.MatchString(val) {
1061					reposWant[repoIdx] = true
1062				}
1063			}
1064		}
1065
1066		return &docMatchTree{
1067			reason:  "Meta",
1068			numDocs: d.numDocs(),
1069			predicate: func(docID uint32) bool {
1070				repoIdx := d.repos[docID]
1071				if int(repoIdx) >= len(reposWant) {
1072					return false
1073				}
1074				return reposWant[repoIdx]
1075			},
1076		}, nil
1077
1078	case *query.Substring:
1079		return d.newSubstringMatchTree(s)
1080
1081	case *query.Branch:
1082		masks := make([]uint64, 0, len(d.repoMetaData))
1083		if s.Pattern == "HEAD" {
1084			for range d.repoMetaData {
1085				masks = append(masks, 1)
1086			}
1087		} else {
1088			for _, branchIDs := range d.branchIDs {
1089				mask := uint64(0)
1090				for nm, m := range branchIDs {
1091					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1092						mask |= uint64(m)
1093					}
1094				}
1095				masks = append(masks, mask)
1096			}
1097		}
1098		return &branchQueryMatchTree{
1099			masks:     masks,
1100			fileMasks: d.fileBranchMasks,
1101			repos:     d.repos,
1102		}, nil
1103	case *query.Const:
1104		if s.Value {
1105			return &bruteForceMatchTree{}, nil
1106		} else {
1107			return &noMatchTree{Why: "const"}, nil
1108		}
1109	case *query.Language:
1110		code, ok := d.metaData.LanguageMap[s.Language]
1111		if !ok {
1112			return &noMatchTree{Why: "lang"}, nil
1113		}
1114		return &docMatchTree{
1115			reason:  "language",
1116			numDocs: d.numDocs(),
1117			predicate: func(docID uint32) bool {
1118				return d.getLanguage(docID) == code
1119			},
1120		}, nil
1121
1122	case *query.Symbol:
1123		// Disable WordMatchTree since we don't support it in symbols yet.
1124		optCopy := opt
1125		optCopy.DisableWordMatchOptimization = true
1126
1127		subMT, err := d.newMatchTree(s.Expr, optCopy)
1128		if err != nil {
1129			return nil, err
1130		}
1131
1132		if substr, ok := subMT.(*substrMatchTree); ok {
1133			return &symbolSubstrMatchTree{
1134				substrMatchTree: substr,
1135				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1136				fileEndRunes:    d.fileEndRunes,
1137				fileEndSymbol:   d.fileEndSymbol,
1138				sections:        d.runeDocSections,
1139			}, nil
1140		}
1141
1142		var regexpMT *regexpMatchTree
1143		visitMatchTree(subMT, func(mt matchTree) {
1144			if t, ok := mt.(*regexpMatchTree); ok {
1145				regexpMT = t
1146			}
1147		})
1148		if regexpMT == nil {
1149			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1150		}
1151
1152		return &symbolRegexpMatchTree{
1153			regexp:    regexpMT.regexp,
1154			all:       isRegexpAll(regexpMT.origRegexp),
1155			matchTree: subMT,
1156		}, nil
1157
1158	case *query.FileNameSet:
1159		return &docMatchTree{
1160			reason:  "FileNameSet",
1161			numDocs: d.numDocs(),
1162			predicate: func(docID uint32) bool {
1163				fileName := d.fileName(docID)
1164				_, ok := s.Set[string(fileName)]
1165				return ok
1166			},
1167		}, nil
1168
1169	case *query.BranchesRepos:
1170		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1171		for repoIdx := range d.repoMetaData {
1172			var mask uint64
1173			for _, br := range s.List {
1174				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1175					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1176				}
1177			}
1178			reposBranchesWant[repoIdx] = mask
1179		}
1180		return &docMatchTree{
1181			reason:  "BranchesRepos",
1182			numDocs: d.numDocs(),
1183			predicate: func(docID uint32) bool {
1184				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1185			},
1186		}, nil
1187
1188	case *query.RepoSet:
1189		reposWant := make([]bool, len(d.repoMetaData))
1190		for repoIdx, r := range d.repoMetaData {
1191			if _, ok := s.Set[r.Name]; ok {
1192				reposWant[repoIdx] = true
1193			}
1194		}
1195		return &docMatchTree{
1196			reason:  "RepoSet",
1197			numDocs: d.numDocs(),
1198			predicate: func(docID uint32) bool {
1199				return reposWant[d.repos[docID]]
1200			},
1201		}, nil
1202
1203	case *query.RepoIDs:
1204		reposWant := make([]bool, len(d.repoMetaData))
1205		for repoIdx, r := range d.repoMetaData {
1206			if s.Repos.Contains(r.ID) {
1207				reposWant[repoIdx] = true
1208			}
1209		}
1210		return &docMatchTree{
1211			reason:  "RepoIDs",
1212			numDocs: d.numDocs(),
1213			predicate: func(docID uint32) bool {
1214				return reposWant[d.repos[docID]]
1215			},
1216		}, nil
1217
1218	case *query.Repo:
1219		reposWant := make([]bool, len(d.repoMetaData))
1220		for repoIdx, r := range d.repoMetaData {
1221			if s.Regexp.MatchString(r.Name) {
1222				reposWant[repoIdx] = true
1223			}
1224		}
1225		return &docMatchTree{
1226			reason:  "Repo",
1227			numDocs: d.numDocs(),
1228			predicate: func(docID uint32) bool {
1229				return reposWant[d.repos[docID]]
1230			},
1231		}, nil
1232
1233	case *query.RepoRegexp:
1234		reposWant := make([]bool, len(d.repoMetaData))
1235		for repoIdx, r := range d.repoMetaData {
1236			if s.Regexp.MatchString(r.Name) {
1237				reposWant[repoIdx] = true
1238			}
1239		}
1240		return &docMatchTree{
1241			reason:  "RepoRegexp",
1242			numDocs: d.numDocs(),
1243			predicate: func(docID uint32) bool {
1244				return reposWant[d.repos[docID]]
1245			},
1246		}, nil
1247
1248	case query.RawConfig:
1249		return &docMatchTree{
1250			reason:  s.String(),
1251			numDocs: d.numDocs(),
1252			predicate: func(docID uint32) bool {
1253				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1254			},
1255		}, nil
1256	}
1257	log.Panicf("type %T", q)
1258	return nil, nil
1259}
1260
1261func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1262	st := &substrMatchTree{
1263		query:         s,
1264		caseSensitive: s.CaseSensitive,
1265		fileName:      s.FileName,
1266	}
1267
1268	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1269		return newRegexpMatchTree(&query.Regexp{
1270			Regexp:        &syntax.Regexp{Op: syntax.OpLiteral, Rune: []rune(s.Pattern)},
1271			FileName:      s.FileName,
1272			Content:       s.Content,
1273			CaseSensitive: s.CaseSensitive,
1274		}), nil
1275	}
1276
1277	result, err := d.iterateNgrams(s)
1278	if err != nil {
1279		return nil, err
1280	}
1281	st.matchIterator = result
1282	return st, nil
1283}
1284
1285func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1286	if opt.DisableWordMatchOptimization {
1287		return nil, false
1288	}
1289	// Needs to be case sensitive
1290	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1291		return nil, false
1292	}
1293	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1294	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1295		return nil, false
1296	}
1297	sub := q.Regexp.Sub
1298	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1299		return nil, false
1300	}
1301
1302	return &wordMatchTree{
1303		word:     string(sub[1].Rune),
1304		fileName: q.FileName,
1305	}, true
1306}
1307
1308// pruneMatchTree removes impossible branches from the matchTree, as indicated
1309// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1310func pruneMatchTree(mt matchTree) (matchTree, error) {
1311	var err error
1312	switch mt := mt.(type) {
1313	// leaf nodes that we test with the filter:
1314	case *substrMatchTree:
1315		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1316			if _, ok := res.matchIterator.(*noMatchTree); ok {
1317				return nil, nil
1318			}
1319		}
1320	// recursive tree structures:
1321	case *andMatchTree:
1322		// Any branch of an and becoming impossible means the entire clause
1323		// is impossible. Otherwise, just handle rewrites.
1324		for i, child := range mt.children {
1325			newChild, err := pruneMatchTree(child)
1326			if err != nil {
1327				return nil, err
1328			}
1329			if newChild == nil {
1330				return nil, nil
1331			}
1332			mt.children[i] = newChild
1333		}
1334	case *orMatchTree:
1335		// *All* branches of an OR becoming impossible means the entire clause
1336		// is impossible. Otherwise, drop impossible subclauses and handle
1337		// rewrites, including simplifying to a singular resulting child branch.
1338		n := 0
1339		for _, child := range mt.children {
1340			newChild, err := pruneMatchTree(child)
1341			if err != nil {
1342				return nil, err
1343			}
1344			if newChild != nil {
1345				mt.children[n] = newChild
1346				n++
1347			}
1348		}
1349		mt.children = mt.children[:n]
1350		if len(mt.children) == 1 {
1351			return mt.children[0], nil
1352		} else if len(mt.children) == 0 {
1353			return nil, nil
1354		}
1355	case *noVisitMatchTree:
1356		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1357		if err != nil {
1358			return nil, err
1359		}
1360		if mt.matchTree == nil {
1361			return nil, nil
1362		}
1363	case *fileNameMatchTree:
1364		mt.child, err = pruneMatchTree(mt.child)
1365		if err != nil {
1366			return nil, err
1367		}
1368		if mt.child == nil {
1369			return nil, nil
1370		}
1371	case *boostMatchTree:
1372		mt.child, err = pruneMatchTree(mt.child)
1373		if err != nil {
1374			return nil, err
1375		}
1376		if mt.child == nil {
1377			return nil, nil
1378		}
1379	case *andLineMatchTree:
1380		child, err := pruneMatchTree(&mt.andMatchTree)
1381		if err != nil {
1382			return nil, err
1383		}
1384		if child == nil {
1385			return nil, nil
1386		}
1387		if c, ok := child.(*andMatchTree); ok {
1388			mt.andMatchTree = *c
1389		} else {
1390			// the and was simplified to a single clause,
1391			// so the linematch portion is irrelevant.
1392			return mt, nil
1393		}
1394	case *notMatchTree:
1395		mt.child, err = pruneMatchTree(mt.child)
1396		if err != nil {
1397			return nil, err
1398		}
1399		if mt.child == nil {
1400			// not false => true
1401			return &bruteForceMatchTree{}, nil
1402		}
1403	// unhandled:
1404	case *docMatchTree:
1405	case *bruteForceMatchTree:
1406	case *regexpMatchTree:
1407	case *wordMatchTree:
1408	}
1409	return mt, err
1410}
1411
1412// isRegexpAll returns true if the query matches all possible lines.
1413//
1414// Note: it is possible for a funky regex to actually match all but this
1415// returns false. This returns true for normal looking regexes like ".*" or
1416// "(?-s:.*)".
1417func isRegexpAll(r *syntax.Regexp) bool {
1418	// Note: we don't care about any flags since we are looking for .* and we
1419	// don't mind if . matches all or everything but newline.
1420
1421	// for loop is for visiting children of OpCapture/OpConcat until we hit .*
1422	for {
1423		// Our main target: .*
1424		if r.Op == syntax.OpStar && len(r.Sub) == 1 { // *
1425			// (?s:.) or (?-s:.)
1426			return r.Sub[0].Op == syntax.OpAnyChar || r.Sub[0].Op == syntax.OpAnyCharNotNL
1427		}
1428
1429		// Strip away expressions being wrapped in paranthesis
1430		if (r.Op == syntax.OpCapture || r.Op == syntax.OpConcat) && len(r.Sub) == 1 {
1431			r = r.Sub[0]
1432			continue
1433		}
1434
1435		return false
1436	}
1437}
Configure Feed

Configure Feed