matchtree.go at 344e33ae3c44c220a48b27e1d2e70e30dcd09a85 · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 344e33ae3c44c220a48b27e1d2e70e30dcd09a85 32 kB View raw
Keegan Carruthers-Smith matchtree: always use evalMatchTree (#698) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we may find something interesting.
  33	//
  34	// This is like a "peek" and shouldn't mutate state. prepare is what should
  35	// change state.
  36	nextDoc() uint32
  37
  38	// clears any per-document state of the docIterator, and
  39	// prepares for evaluating the given doc. The argument is
  40	// strictly increasing over time.
  41	prepare(nextDoc uint32)
  42}
  43
  44// costs are passed in increasing order to matchTree.matches until they do not
  45// return matchesRequiresHigherCost.
  46const (
  47	costConst   = 0
  48	costMemory  = 1
  49	costContent = 2
  50	costRegexp  = 3
  51)
  52
  53const (
  54	costMin = costConst
  55	costMax = costRegexp
  56)
  57
  58// matchesState is an enum for the state of a matchTree after a call to
  59// matchTree.matches.
  60type matchesState uint8
  61
  62const (
  63	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  64	// a search yet since the cost value is not high enough.
  65	matchesRequiresHigherCost matchesState = iota
  66
  67	// matchesFound is returned when matchTree.matches has done a search and
  68	// found one or more matches.
  69	matchesFound
  70
  71	// matchesNone is returned when matchTree.matches has done a search and
  72	// found nothing.
  73	matchesNone
  74)
  75
  76// matchesStatePred is a helper which returns matchesFound if b is true
  77// otherwise returns matchesNone.
  78func matchesStatePred(b bool) matchesState {
  79	if b {
  80		return matchesFound
  81	}
  82	return matchesNone
  83}
  84
  85// matchesStateForSlice is a helper which returns matchesFound if v is
  86// non-empty otherwise returns matchesNone.
  87func matchesStateForSlice[T any](v []T) matchesState {
  88	return matchesStatePred(len(v) > 0)
  89}
  90
  91// An expression tree coupled with matches. The matchtree has two
  92// functions:
  93//
  94// * it implements boolean combinations (and, or, not)
  95//
  96// * it implements shortcuts, where we skip documents (for example: if
  97// there are no trigram matches, we can be sure there are no substring
  98// matches). The matchtree iterates over the documents as they are
  99// ordered in the shard.
 100//
 101// The general process for a given (shard, query) is:
 102//
 103// - construct matchTree for the query
 104//
 105// - find all different leaf matchTrees (substring, regexp, etc.)
 106//
 107// in a loop:
 108//
 109//   - find next doc to process using nextDoc
 110//
 111//   - evaluate atoms (leaf expressions that match text)
 112//
 113//   - evaluate the tree using matches(), storing the result in map.
 114//
 115//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 116//     for the document, collect all text matches by looking at leaf
 117//     matchTrees.
 118type matchTree interface {
 119	docIterator
 120
 121	// matches if cost is high enough. See documentation for matchesState's
 122	// values.
 123	//
 124	// Note: Do not call this directly, rather use evalMatchTree which uses
 125	// known to cache responses once the state transitions away from
 126	// matchesRequiresHigherCost.
 127	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 128}
 129
 130// docMatchTree iterates over documents for which predicate(docID) returns true.
 131type docMatchTree struct {
 132	// the number of documents in a shard.
 133	numDocs uint32
 134
 135	predicate func(docID uint32) bool
 136
 137	// provides additional information about the reason why the docMatchTree was
 138	// created.
 139	reason string
 140
 141	// mutable
 142	firstDone bool
 143	docID     uint32
 144}
 145
 146type bruteForceMatchTree struct {
 147	// mutable
 148	firstDone bool
 149	docID     uint32
 150}
 151
 152type andLineMatchTree struct {
 153	andMatchTree
 154}
 155
 156type andMatchTree struct {
 157	children []matchTree
 158}
 159
 160type orMatchTree struct {
 161	children []matchTree
 162}
 163
 164type notMatchTree struct {
 165	child matchTree
 166}
 167
 168// Returns only the filename of child matches.
 169type fileNameMatchTree struct {
 170	child matchTree
 171}
 172
 173// Don't visit this subtree for collecting matches.
 174type noVisitMatchTree struct {
 175	matchTree
 176}
 177
 178type regexpMatchTree struct {
 179	regexp *regexp.Regexp
 180
 181	fileName bool
 182
 183	// mutable
 184	reEvaluated bool
 185	found       []*candidateMatch
 186
 187	// nextDoc, prepare.
 188	bruteForceMatchTree
 189}
 190
 191// \bLITERAL\b
 192type wordMatchTree struct {
 193	word string
 194
 195	fileName bool
 196
 197	// mutable
 198	evaluated bool
 199	found     []*candidateMatch
 200
 201	// nextDoc, prepare.
 202	bruteForceMatchTree
 203}
 204
 205type substrMatchTree struct {
 206	matchIterator
 207
 208	query         *query.Substring
 209	caseSensitive bool
 210	fileName      bool
 211
 212	// mutable
 213	current       []*candidateMatch
 214	contEvaluated bool
 215}
 216
 217type branchQueryMatchTree struct {
 218	fileMasks []uint64
 219	masks     []uint64
 220	repos     []uint16
 221
 222	// mutable
 223	firstDone bool
 224	docID     uint32
 225}
 226
 227func (t *branchQueryMatchTree) branchMask() uint64 {
 228	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 229}
 230
 231type symbolRegexpMatchTree struct {
 232	matchTree
 233	regexp *regexp.Regexp
 234	all    bool // skips regex match if .*
 235
 236	reEvaluated bool
 237	found       []*candidateMatch
 238}
 239
 240func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 241	t.reEvaluated = false
 242	t.found = t.found[:0]
 243	t.matchTree.prepare(doc)
 244}
 245
 246func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 247	if t.reEvaluated {
 248		return matchesStateForSlice(t.found)
 249	}
 250
 251	if cost < costRegexp {
 252		return matchesRequiresHigherCost
 253	}
 254
 255	sections := cp.docSections()
 256	content := cp.data(false)
 257
 258	found := t.found[:0]
 259	for i, sec := range sections {
 260		var idx []int
 261		if t.all {
 262			idx = []int{0, int(sec.End - sec.Start)}
 263		} else {
 264			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 265			if idx == nil {
 266				continue
 267			}
 268		}
 269
 270		cm := &candidateMatch{
 271			byteOffset:  sec.Start + uint32(idx[0]),
 272			byteMatchSz: uint32(idx[1] - idx[0]),
 273			symbol:      true,
 274			symbolIdx:   uint32(i),
 275		}
 276		found = append(found, cm)
 277	}
 278	t.found = found
 279	t.reEvaluated = true
 280
 281	return matchesStateForSlice(t.found)
 282}
 283
 284type symbolSubstrMatchTree struct {
 285	*substrMatchTree
 286
 287	patternSize   uint32
 288	fileEndRunes  []uint32
 289	fileEndSymbol []uint32
 290
 291	doc      uint32
 292	sections []DocumentSection
 293
 294	secID uint32
 295}
 296
 297func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 298	t.substrMatchTree.prepare(doc)
 299	t.doc = doc
 300
 301	var fileStart uint32
 302	if doc > 0 {
 303		fileStart = t.fileEndRunes[doc-1]
 304	}
 305
 306	var sections []DocumentSection
 307	if len(t.sections) > 0 {
 308		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 309		if most == uint32(len(t.sections)) {
 310			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 311		} else {
 312			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 313				t.secID++
 314			}
 315
 316			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 317			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 318				symbolEnd++
 319			}
 320
 321			sections = t.sections[t.secID:symbolEnd]
 322		}
 323	}
 324
 325	secIdx := 0
 326	trimmed := t.current[:0]
 327	for len(sections) > secIdx && len(t.current) > 0 {
 328		start := fileStart + t.current[0].runeOffset
 329		end := start + t.patternSize
 330		if start >= sections[secIdx].End {
 331			secIdx++
 332			continue
 333		}
 334
 335		if start < sections[secIdx].Start {
 336			t.current = t.current[1:]
 337			continue
 338		}
 339
 340		if end <= sections[secIdx].End {
 341			t.current[0].symbol = true
 342			t.current[0].symbolIdx = uint32(secIdx)
 343			trimmed = append(trimmed, t.current[0])
 344		}
 345
 346		t.current = t.current[1:]
 347	}
 348	t.current = trimmed
 349}
 350
 351// all prepare methods
 352
 353func (t *bruteForceMatchTree) prepare(doc uint32) {
 354	t.docID = doc
 355	t.firstDone = true
 356}
 357
 358func (t *docMatchTree) prepare(doc uint32) {
 359	t.docID = doc
 360	t.firstDone = true
 361}
 362
 363func (t *andMatchTree) prepare(doc uint32) {
 364	for _, c := range t.children {
 365		c.prepare(doc)
 366	}
 367}
 368
 369func (t *regexpMatchTree) prepare(doc uint32) {
 370	t.found = t.found[:0]
 371	t.reEvaluated = false
 372	t.bruteForceMatchTree.prepare(doc)
 373}
 374
 375func (t *wordMatchTree) prepare(doc uint32) {
 376	t.found = t.found[:0]
 377	t.evaluated = false
 378	t.bruteForceMatchTree.prepare(doc)
 379}
 380
 381func (t *orMatchTree) prepare(doc uint32) {
 382	for _, c := range t.children {
 383		c.prepare(doc)
 384	}
 385}
 386
 387func (t *notMatchTree) prepare(doc uint32) {
 388	t.child.prepare(doc)
 389}
 390
 391func (t *fileNameMatchTree) prepare(doc uint32) {
 392	t.child.prepare(doc)
 393}
 394
 395func (t *substrMatchTree) prepare(nextDoc uint32) {
 396	t.matchIterator.prepare(nextDoc)
 397	t.current = t.matchIterator.candidates()
 398	t.contEvaluated = false
 399}
 400
 401func (t *branchQueryMatchTree) prepare(doc uint32) {
 402	t.firstDone = true
 403	t.docID = doc
 404}
 405
 406// nextDoc
 407
 408func (t *docMatchTree) nextDoc() uint32 {
 409	var start uint32
 410	if t.firstDone {
 411		start = t.docID + 1
 412	}
 413	for i := start; i < t.numDocs; i++ {
 414		if t.predicate(i) {
 415			return i
 416		}
 417	}
 418	return maxUInt32
 419}
 420
 421func (t *bruteForceMatchTree) nextDoc() uint32 {
 422	if !t.firstDone {
 423		return 0
 424	}
 425	return t.docID + 1
 426}
 427
 428func (t *andMatchTree) nextDoc() uint32 {
 429	var max uint32
 430	for _, c := range t.children {
 431		m := c.nextDoc()
 432		if m > max {
 433			max = m
 434		}
 435	}
 436	return max
 437}
 438
 439func (t *orMatchTree) nextDoc() uint32 {
 440	min := uint32(maxUInt32)
 441	for _, c := range t.children {
 442		m := c.nextDoc()
 443		if m < min {
 444			min = m
 445		}
 446	}
 447	return min
 448}
 449
 450func (t *notMatchTree) nextDoc() uint32 {
 451	return 0
 452}
 453
 454func (t *fileNameMatchTree) nextDoc() uint32 {
 455	return t.child.nextDoc()
 456}
 457
 458func (t *branchQueryMatchTree) nextDoc() uint32 {
 459	var start uint32
 460	if t.firstDone {
 461		start = t.docID + 1
 462	}
 463
 464	for i := start; i < uint32(len(t.fileMasks)); i++ {
 465		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 466			return i
 467		}
 468	}
 469	return maxUInt32
 470}
 471
 472// all String methods
 473
 474func (t *bruteForceMatchTree) String() string {
 475	return "all"
 476}
 477
 478func (t *docMatchTree) String() string {
 479	return fmt.Sprintf("doc(%s)", t.reason)
 480}
 481
 482func (t *andMatchTree) String() string {
 483	return fmt.Sprintf("and%v", t.children)
 484}
 485
 486func (t *regexpMatchTree) String() string {
 487	f := ""
 488	if t.fileName {
 489		f = "f"
 490	}
 491	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 492}
 493
 494func (t *wordMatchTree) String() string {
 495	f := ""
 496	if t.fileName {
 497		f = "f"
 498	}
 499	return fmt.Sprintf("%sword(%s)", f, t.word)
 500}
 501
 502func (t *orMatchTree) String() string {
 503	return fmt.Sprintf("or%v", t.children)
 504}
 505
 506func (t *notMatchTree) String() string {
 507	return fmt.Sprintf("not(%v)", t.child)
 508}
 509
 510func (t *noVisitMatchTree) String() string {
 511	return fmt.Sprintf("novisit(%v)", t.matchTree)
 512}
 513
 514func (t *fileNameMatchTree) String() string {
 515	return fmt.Sprintf("f(%v)", t.child)
 516}
 517
 518func (t *substrMatchTree) String() string {
 519	f := ""
 520	if t.fileName {
 521		f = "f"
 522	}
 523
 524	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 525}
 526
 527func (t *branchQueryMatchTree) String() string {
 528	return fmt.Sprintf("branch(%x)", t.masks)
 529}
 530
 531func (t *symbolSubstrMatchTree) String() string {
 532	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 533}
 534
 535func (t *symbolRegexpMatchTree) String() string {
 536	return fmt.Sprintf("symbol(%v)", t.matchTree)
 537}
 538
 539// visitMatches visits all atoms in matchTree. Note: This visits
 540// noVisitMatchTree. For collecting matches use visitMatches.
 541func visitMatchTree(t matchTree, f func(matchTree)) {
 542	switch s := t.(type) {
 543	case *andMatchTree:
 544		for _, ch := range s.children {
 545			visitMatchTree(ch, f)
 546		}
 547	case *orMatchTree:
 548		for _, ch := range s.children {
 549			visitMatchTree(ch, f)
 550		}
 551	case *andLineMatchTree:
 552		visitMatchTree(&s.andMatchTree, f)
 553	case *noVisitMatchTree:
 554		visitMatchTree(s.matchTree, f)
 555	case *notMatchTree:
 556		visitMatchTree(s.child, f)
 557	case *fileNameMatchTree:
 558		visitMatchTree(s.child, f)
 559	case *symbolSubstrMatchTree:
 560		visitMatchTree(s.substrMatchTree, f)
 561	case *symbolRegexpMatchTree:
 562		visitMatchTree(s.matchTree, f)
 563	default:
 564		f(t)
 565	}
 566}
 567
 568// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 569// function defined.
 570func updateMatchTreeStats(mt matchTree, stats *Stats) {
 571	visitMatchTree(mt, func(mt matchTree) {
 572		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 573			atom.updateStats(stats)
 574		}
 575	})
 576}
 577
 578// visitMatches visits all atoms which can contribute matches. Note: This
 579// skips noVisitMatchTree.
 580func visitMatches(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 581	switch s := t.(type) {
 582	case *andMatchTree:
 583		for _, ch := range s.children {
 584			if known[ch] {
 585				visitMatches(ch, known, f)
 586			}
 587		}
 588	case *andLineMatchTree:
 589		visitMatches(&s.andMatchTree, known, f)
 590	case *orMatchTree:
 591		for _, ch := range s.children {
 592			if known[ch] {
 593				visitMatches(ch, known, f)
 594			}
 595		}
 596	case *symbolSubstrMatchTree:
 597		visitMatches(s.substrMatchTree, known, f)
 598	case *notMatchTree:
 599	case *noVisitMatchTree:
 600		// don't collect into negative trees.
 601	case *fileNameMatchTree:
 602		// We will just gather the filename if we do not visit this tree.
 603	default:
 604		f(s)
 605	}
 606}
 607
 608// all matches() methods.
 609
 610func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 611	return matchesStatePred(t.predicate(cp.idx))
 612}
 613
 614func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 615	return matchesFound
 616}
 617
 618// andLineMatchTree is a performance optimization of andMatchTree. For content
 619// searches we don't want to run the regex engine if there is no line that
 620// contains matches from all terms.
 621func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 622	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 623		return state
 624	}
 625
 626	// Invariant: all children have matches. If any line contains all of them we
 627	// can return MatchesFound.
 628
 629	// find child with fewest candidates
 630	min := maxUInt32
 631	fewestChildren := 0
 632	for ix, child := range t.children {
 633		v, ok := child.(*substrMatchTree)
 634		// make sure we are running a content search and that all candidates are a
 635		// substrMatchTree
 636		if !ok || v.fileName {
 637			return matchesFound
 638		}
 639		if len(v.current) < min {
 640			min = len(v.current)
 641			fewestChildren = ix
 642		}
 643	}
 644
 645	type lineRange struct {
 646		start int
 647		end   int
 648	}
 649	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 650	prev := -1
 651	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 652		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 653		if line == prev {
 654			continue
 655		}
 656		prev = line
 657		lines = append(lines, lineRange{byteStart, byteEnd})
 658	}
 659
 660	// children keeps track of the children's candidates we have already seen.
 661	children := make([][]*candidateMatch, 0, len(t.children)-1)
 662	for j, child := range t.children {
 663		if j == fewestChildren {
 664			continue
 665		}
 666		children = append(children, child.(*substrMatchTree).current)
 667	}
 668
 669nextLine:
 670	for i := 0; i < len(lines); i++ {
 671		hits := 1
 672	nextChild:
 673		for j := range children {
 674		nextCandidate:
 675			for len(children[j]) > 0 {
 676				candidate := children[j][0]
 677				bo := int(cp.findOffset(false, candidate.runeOffset))
 678				if bo < lines[i].start {
 679					children[j] = children[j][1:]
 680					continue nextCandidate
 681				}
 682				if bo <= lines[i].end {
 683					hits++
 684					continue nextChild
 685				}
 686				// move the `lines` iterator forward until bo <= line.end
 687				for i < len(lines) && bo > lines[i].end {
 688					i++
 689				}
 690				i--
 691				continue nextLine
 692			}
 693		}
 694		// return early once we found any line that contains matches from all children
 695		if hits == len(t.children) {
 696			return matchesFound
 697		}
 698	}
 699	return matchesNone
 700}
 701
 702func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 703	// We have found matches unless a child needs to do more work or it hasn't
 704	// found matches.
 705	state := matchesFound
 706
 707	for _, ch := range t.children {
 708		switch evalMatchTree(cp, cost, known, ch) {
 709		case matchesRequiresHigherCost:
 710			// keep evaluating other children incase we come across matchesNone
 711			state = matchesRequiresHigherCost
 712		case matchesFound:
 713			// will return this if every child has this value
 714		case matchesNone:
 715			return matchesNone
 716		}
 717	}
 718
 719	return state
 720}
 721
 722func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 723	// we could short-circuit, but we want to use the other possibilities as a
 724	// ranking signal. So we always return the most conservative state.
 725	state := matchesNone
 726	for _, ch := range t.children {
 727		switch evalMatchTree(cp, cost, known, ch) {
 728		case matchesRequiresHigherCost:
 729			state = matchesRequiresHigherCost
 730		case matchesFound:
 731			if state != matchesRequiresHigherCost {
 732				state = matchesFound
 733			}
 734		case matchesNone:
 735			// noop
 736		}
 737	}
 738	return state
 739}
 740
 741func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 742	return matchesStatePred(t.branchMask() != 0)
 743}
 744
 745func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 746	if t.reEvaluated {
 747		return matchesStateForSlice(t.found)
 748	}
 749
 750	if cost < costRegexp {
 751		return matchesRequiresHigherCost
 752	}
 753
 754	cp.stats.RegexpsConsidered++
 755	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 756	found := t.found[:0]
 757	for _, idx := range idxs {
 758		cm := &candidateMatch{
 759			byteOffset:  uint32(idx[0]),
 760			byteMatchSz: uint32(idx[1] - idx[0]),
 761			fileName:    t.fileName,
 762		}
 763
 764		found = append(found, cm)
 765	}
 766	t.found = found
 767	t.reEvaluated = true
 768
 769	return matchesStateForSlice(t.found)
 770}
 771
 772func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 773	if t.evaluated {
 774		return matchesStateForSlice(t.found)
 775	}
 776
 777	if cost < costRegexp {
 778		return matchesRequiresHigherCost
 779	}
 780
 781	data := cp.data(t.fileName)
 782	offset := 0
 783	found := t.found[:0]
 784	for {
 785		idx := bytes.Index(data[offset:], []byte(t.word))
 786		if idx < 0 {
 787			break
 788		}
 789
 790		relStartOffset := offset + idx
 791		relEndOffset := relStartOffset + len(t.word)
 792
 793		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 794		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 795		if startBoundary && endBoundary {
 796			found = append(found, &candidateMatch{
 797				byteOffset:  uint32(offset + idx),
 798				byteMatchSz: uint32(len(t.word)),
 799				fileName:    t.fileName,
 800			})
 801		}
 802		offset += idx + len(t.word)
 803	}
 804
 805	t.found = found
 806	t.evaluated = true
 807
 808	return matchesStateForSlice(t.found)
 809}
 810
 811// breakMatchesOnNewlines returns matches resulting from breaking each element
 812// of cms on newlines within text.
 813func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 814	var lineCMs []*candidateMatch
 815	for _, cm := range cms {
 816		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 817	}
 818	return lineCMs
 819}
 820
 821// breakOnNewlines returns matches resulting from breaking cm on newlines
 822// within text.
 823func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 824	var cms []*candidateMatch
 825	addMe := &candidateMatch{}
 826	*addMe = *cm
 827	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 828		if text[i] == '\n' {
 829			addMe.byteMatchSz = i - addMe.byteOffset
 830			if addMe.byteMatchSz != 0 {
 831				cms = append(cms, addMe)
 832			}
 833
 834			addMe = &candidateMatch{}
 835			*addMe = *cm
 836			addMe.byteOffset = i + 1
 837		}
 838	}
 839	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 840	if addMe.byteMatchSz != 0 {
 841		cms = append(cms, addMe)
 842	}
 843	return cms
 844}
 845
 846// evalMatchTree should be called instead of directly calling
 847// matchTree.matches. It cache known values for future evaluation at higher
 848// costs.
 849func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 850	if v, ok := known[mt]; ok {
 851		return matchesStatePred(v)
 852	}
 853
 854	ms := mt.matches(cp, cost, known)
 855	if ms != matchesRequiresHigherCost {
 856		known[mt] = ms == matchesFound
 857	}
 858
 859	return ms
 860}
 861
 862func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 863	switch evalMatchTree(cp, cost, known, t.child) {
 864	case matchesRequiresHigherCost:
 865		return matchesRequiresHigherCost
 866	case matchesFound:
 867		return matchesNone
 868	case matchesNone:
 869		return matchesFound
 870	default:
 871		panic("unreachable")
 872	}
 873}
 874
 875func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 876	return evalMatchTree(cp, cost, known, t.child)
 877}
 878
 879func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 880	if t.contEvaluated {
 881		return matchesStateForSlice(t.current)
 882	}
 883
 884	if len(t.current) == 0 {
 885		return matchesNone
 886	}
 887
 888	if t.fileName && cost < costMemory {
 889		return matchesRequiresHigherCost
 890	}
 891
 892	if !t.fileName && cost < costContent {
 893		return matchesRequiresHigherCost
 894	}
 895
 896	pruned := t.current[:0]
 897	for _, m := range t.current {
 898		if m.byteOffset == 0 && m.runeOffset > 0 {
 899			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 900		}
 901		if m.matchContent(cp.data(m.fileName)) {
 902			pruned = append(pruned, m)
 903		}
 904	}
 905	t.current = pruned
 906	t.contEvaluated = true
 907
 908	return matchesStateForSlice(t.current)
 909}
 910
 911type matchTreeOpt struct {
 912	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 913	// This was added since we do not support wordMatchTree with symbol search.
 914	DisableWordMatchOptimization bool
 915}
 916
 917func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 918	if q == nil {
 919		return nil, fmt.Errorf("got nil (sub)query")
 920	}
 921	switch s := q.(type) {
 922	case *query.Regexp:
 923		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 924		// superset of the regexp. If the returned matchTree is equivalent to the
 925		// original regexp, it returns true. An equivalent matchTree has the same
 926		// behaviour as the original regexp and can be used instead.
 927		//
 928		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 929		if err != nil {
 930			return nil, err
 931		}
 932		// if the query can be used in place of the regexp
 933		// return the subtree
 934		if isEq {
 935			return subMT, nil
 936		}
 937
 938		var tr matchTree
 939		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 940			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 941			// provide something faster.
 942			tr = wmt
 943		} else {
 944			prefix := ""
 945			if !s.CaseSensitive {
 946				prefix = "(?i)"
 947			}
 948
 949			tr = &regexpMatchTree{
 950				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 951				fileName: s.FileName,
 952			}
 953		}
 954
 955		return &andMatchTree{
 956			children: []matchTree{
 957				tr, &noVisitMatchTree{subMT},
 958			},
 959		}, nil
 960	case *query.And:
 961		var r []matchTree
 962		for _, ch := range s.Children {
 963			ct, err := d.newMatchTree(ch, opt)
 964			if err != nil {
 965				return nil, err
 966			}
 967			r = append(r, ct)
 968		}
 969		return &andMatchTree{r}, nil
 970	case *query.Or:
 971		var r []matchTree
 972		for _, ch := range s.Children {
 973			ct, err := d.newMatchTree(ch, opt)
 974			if err != nil {
 975				return nil, err
 976			}
 977			r = append(r, ct)
 978		}
 979		return &orMatchTree{r}, nil
 980	case *query.Not:
 981		ct, err := d.newMatchTree(s.Child, opt)
 982		return &notMatchTree{
 983			child: ct,
 984		}, err
 985
 986	case *query.Type:
 987		if s.Type != query.TypeFileName {
 988			break
 989		}
 990
 991		ct, err := d.newMatchTree(s.Child, opt)
 992		if err != nil {
 993			return nil, err
 994		}
 995
 996		return &fileNameMatchTree{
 997			child: ct,
 998		}, nil
 999
1000	case *query.Substring:
1001		return d.newSubstringMatchTree(s)
1002
1003	case *query.Branch:
1004		masks := make([]uint64, 0, len(d.repoMetaData))
1005		if s.Pattern == "HEAD" {
1006			for i := 0; i < len(d.repoMetaData); i++ {
1007				masks = append(masks, 1)
1008			}
1009		} else {
1010			for _, branchIDs := range d.branchIDs {
1011				mask := uint64(0)
1012				for nm, m := range branchIDs {
1013					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1014						mask |= uint64(m)
1015					}
1016				}
1017				masks = append(masks, mask)
1018			}
1019
1020		}
1021		return &branchQueryMatchTree{
1022			masks:     masks,
1023			fileMasks: d.fileBranchMasks,
1024			repos:     d.repos,
1025		}, nil
1026	case *query.Const:
1027		if s.Value {
1028			return &bruteForceMatchTree{}, nil
1029		} else {
1030			return &noMatchTree{Why: "const"}, nil
1031		}
1032	case *query.Language:
1033		code, ok := d.metaData.LanguageMap[s.Language]
1034		if !ok {
1035			return &noMatchTree{Why: "lang"}, nil
1036		}
1037		return &docMatchTree{
1038			reason:  "language",
1039			numDocs: d.numDocs(),
1040			predicate: func(docID uint32) bool {
1041				return d.getLanguage(docID) == code
1042			},
1043		}, nil
1044
1045	case *query.Symbol:
1046		// Disable WordMatchTree since we don't support it in symbols yet.
1047		optCopy := opt
1048		optCopy.DisableWordMatchOptimization = true
1049
1050		subMT, err := d.newMatchTree(s.Expr, optCopy)
1051		if err != nil {
1052			return nil, err
1053		}
1054
1055		if substr, ok := subMT.(*substrMatchTree); ok {
1056			return &symbolSubstrMatchTree{
1057				substrMatchTree: substr,
1058				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1059				fileEndRunes:    d.fileEndRunes,
1060				fileEndSymbol:   d.fileEndSymbol,
1061				sections:        d.runeDocSections,
1062			}, nil
1063		}
1064
1065		var regexp *regexp.Regexp
1066		visitMatchTree(subMT, func(mt matchTree) {
1067			if t, ok := mt.(*regexpMatchTree); ok {
1068				regexp = t.regexp
1069			}
1070		})
1071		if regexp == nil {
1072			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1073		}
1074
1075		return &symbolRegexpMatchTree{
1076			regexp:    regexp,
1077			all:       regexp.String() == "(?i)(?-s:.)*",
1078			matchTree: subMT,
1079		}, nil
1080
1081	case *query.FileNameSet:
1082		return &docMatchTree{
1083			reason:  "FileNameSet",
1084			numDocs: d.numDocs(),
1085			predicate: func(docID uint32) bool {
1086				fileName := d.fileName(docID)
1087				_, ok := s.Set[string(fileName)]
1088				return ok
1089			},
1090		}, nil
1091
1092	case *query.BranchesRepos:
1093		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1094		for repoIdx := range d.repoMetaData {
1095			var mask uint64
1096			for _, br := range s.List {
1097				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1098					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1099				}
1100			}
1101			reposBranchesWant[repoIdx] = mask
1102		}
1103		return &docMatchTree{
1104			reason:  "BranchesRepos",
1105			numDocs: d.numDocs(),
1106			predicate: func(docID uint32) bool {
1107				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1108			},
1109		}, nil
1110
1111	case *query.RepoSet:
1112		reposWant := make([]bool, len(d.repoMetaData))
1113		for repoIdx, r := range d.repoMetaData {
1114			if _, ok := s.Set[r.Name]; ok {
1115				reposWant[repoIdx] = true
1116			}
1117		}
1118		return &docMatchTree{
1119			reason:  "RepoSet",
1120			numDocs: d.numDocs(),
1121			predicate: func(docID uint32) bool {
1122				return reposWant[d.repos[docID]]
1123			},
1124		}, nil
1125
1126	case *query.RepoIDs:
1127		reposWant := make([]bool, len(d.repoMetaData))
1128		for repoIdx, r := range d.repoMetaData {
1129			if s.Repos.Contains(r.ID) {
1130				reposWant[repoIdx] = true
1131			}
1132		}
1133		return &docMatchTree{
1134			reason:  "RepoIDs",
1135			numDocs: d.numDocs(),
1136			predicate: func(docID uint32) bool {
1137				return reposWant[d.repos[docID]]
1138			},
1139		}, nil
1140
1141	case *query.Repo:
1142		reposWant := make([]bool, len(d.repoMetaData))
1143		for repoIdx, r := range d.repoMetaData {
1144			if s.Regexp.MatchString(r.Name) {
1145				reposWant[repoIdx] = true
1146			}
1147		}
1148		return &docMatchTree{
1149			reason:  "Repo",
1150			numDocs: d.numDocs(),
1151			predicate: func(docID uint32) bool {
1152				return reposWant[d.repos[docID]]
1153			},
1154		}, nil
1155
1156	case *query.RepoRegexp:
1157		reposWant := make([]bool, len(d.repoMetaData))
1158		for repoIdx, r := range d.repoMetaData {
1159			if s.Regexp.MatchString(r.Name) {
1160				reposWant[repoIdx] = true
1161			}
1162		}
1163		return &docMatchTree{
1164			reason:  "RepoRegexp",
1165			numDocs: d.numDocs(),
1166			predicate: func(docID uint32) bool {
1167				return reposWant[d.repos[docID]]
1168			},
1169		}, nil
1170
1171	case query.RawConfig:
1172		return &docMatchTree{
1173			reason:  s.String(),
1174			numDocs: d.numDocs(),
1175			predicate: func(docID uint32) bool {
1176				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1177			},
1178		}, nil
1179	}
1180	log.Panicf("type %T", q)
1181	return nil, nil
1182}
1183
1184func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1185	st := &substrMatchTree{
1186		query:         s,
1187		caseSensitive: s.CaseSensitive,
1188		fileName:      s.FileName,
1189	}
1190
1191	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1192		prefix := ""
1193		if !s.CaseSensitive {
1194			prefix = "(?i)"
1195		}
1196		t := &regexpMatchTree{
1197			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1198			fileName: s.FileName,
1199		}
1200		return t, nil
1201	}
1202
1203	result, err := d.iterateNgrams(s)
1204	if err != nil {
1205		return nil, err
1206	}
1207	st.matchIterator = result
1208	return st, nil
1209}
1210
1211func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1212	if opt.DisableWordMatchOptimization {
1213		return nil, false
1214	}
1215	// Needs to be case sensitive
1216	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1217		return nil, false
1218	}
1219	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1220	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1221		return nil, false
1222	}
1223	sub := q.Regexp.Sub
1224	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1225		return nil, false
1226	}
1227
1228	return &wordMatchTree{
1229		word:     string(sub[1].Rune),
1230		fileName: q.FileName,
1231	}, true
1232}
1233
1234// pruneMatchTree removes impossible branches from the matchTree, as indicated
1235// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1236func pruneMatchTree(mt matchTree) (matchTree, error) {
1237	var err error
1238	switch mt := mt.(type) {
1239	// leaf nodes that we test with the filter:
1240	case *substrMatchTree:
1241		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1242			if _, ok := res.matchIterator.(*noMatchTree); ok {
1243				return nil, nil
1244			}
1245		}
1246	// recursive tree structures:
1247	case *andMatchTree:
1248		// Any branch of an and becoming impossible means the entire clause
1249		// is impossible. Otherwise, just handle rewrites.
1250		for i, child := range mt.children {
1251			newChild, err := pruneMatchTree(child)
1252			if err != nil {
1253				return nil, err
1254			}
1255			if newChild == nil {
1256				return nil, nil
1257			}
1258			mt.children[i] = newChild
1259		}
1260	case *orMatchTree:
1261		// *All* branches of an OR becoming impossible means the entire clause
1262		// is impossible. Otherwise, drop impossible subclauses and handle
1263		// rewrites, including simplifying to a singular resulting child branch.
1264		n := 0
1265		for _, child := range mt.children {
1266			newChild, err := pruneMatchTree(child)
1267			if err != nil {
1268				return nil, err
1269			}
1270			if newChild != nil {
1271				mt.children[n] = newChild
1272				n++
1273			}
1274		}
1275		mt.children = mt.children[:n]
1276		if len(mt.children) == 1 {
1277			return mt.children[0], nil
1278		} else if len(mt.children) == 0 {
1279			return nil, nil
1280		}
1281	case *noVisitMatchTree:
1282		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1283		if err != nil {
1284			return nil, err
1285		}
1286		if mt.matchTree == nil {
1287			return nil, nil
1288		}
1289	case *fileNameMatchTree:
1290		mt.child, err = pruneMatchTree(mt.child)
1291	case *andLineMatchTree:
1292		child, err := pruneMatchTree(&mt.andMatchTree)
1293		if err != nil {
1294			return nil, err
1295		}
1296		if child == nil {
1297			return nil, nil
1298		}
1299		if c, ok := child.(*andMatchTree); ok {
1300			mt.andMatchTree = *c
1301		} else {
1302			// the and was simplified to a single clause,
1303			// so the linematch portion is irrelevant.
1304			return mt, nil
1305		}
1306	case *notMatchTree:
1307		mt.child, err = pruneMatchTree(mt.child)
1308		if err != nil {
1309			return nil, err
1310		}
1311		if mt.child == nil {
1312			// not false => true
1313			return &bruteForceMatchTree{}, nil
1314		}
1315	// unhandled:
1316	case *docMatchTree:
1317	case *bruteForceMatchTree:
1318	case *regexpMatchTree:
1319	case *wordMatchTree:
1320	}
1321	return mt, err
1322}
Configure Feed

Configure Feed