matchtree.go at 19fa44cea9eb25f96f27805de6f952c6ac93407a · boltless.me/zoekt

fork of https://github.com/sourcegraph/zoekt
zoekt / matchtree.go
at 19fa44cea9eb25f96f27805de6f952c6ac93407a 33 kB View raw
Stefan Hengl fix: prune boostMatchTree if child is pruned (#732) 2y ago
   1// Copyright 2018 Google Inc. All rights reserved.
   2//
   3// Licensed under the Apache License, Version 2.0 (the "License");
   4// you may not use this file except in compliance with the License.
   5// You may obtain a copy of the License at
   6//
   7//    http://www.apache.org/licenses/LICENSE-2.0
   8//
   9// Unless required by applicable law or agreed to in writing, software
  10// distributed under the License is distributed on an "AS IS" BASIS,
  11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12// See the License for the specific language governing permissions and
  13// limitations under the License.
  14
  15package zoekt
  16
  17import (
  18	"bytes"
  19	"fmt"
  20	"log"
  21	"regexp/syntax"
  22	"strings"
  23	"unicode/utf8"
  24
  25	"github.com/grafana/regexp"
  26
  27	"github.com/sourcegraph/zoekt/query"
  28)
  29
  30// A docIterator iterates over documents in order.
  31type docIterator interface {
  32	// provide the next document where we may find something interesting.
  33	//
  34	// This is like a "peek" and shouldn't mutate state. prepare is what should
  35	// change state.
  36	nextDoc() uint32
  37
  38	// clears any per-document state of the docIterator, and
  39	// prepares for evaluating the given doc. The argument is
  40	// strictly increasing over time.
  41	prepare(nextDoc uint32)
  42}
  43
  44// costs are passed in increasing order to matchTree.matches until they do not
  45// return matchesRequiresHigherCost.
  46const (
  47	costConst   = 0
  48	costMemory  = 1
  49	costContent = 2
  50	costRegexp  = 3
  51)
  52
  53const (
  54	costMin = costConst
  55	costMax = costRegexp
  56)
  57
  58// matchesState is an enum for the state of a matchTree after a call to
  59// matchTree.matches.
  60type matchesState uint8
  61
  62const (
  63	// matchesRequiresHigherCost is returned when matchTree.matches hasn't done
  64	// a search yet since the cost value is not high enough.
  65	matchesRequiresHigherCost matchesState = iota
  66
  67	// matchesFound is returned when matchTree.matches has done a search and
  68	// found one or more matches.
  69	matchesFound
  70
  71	// matchesNone is returned when matchTree.matches has done a search and
  72	// found nothing.
  73	matchesNone
  74)
  75
  76// matchesStatePred is a helper which returns matchesFound if b is true
  77// otherwise returns matchesNone.
  78func matchesStatePred(b bool) matchesState {
  79	if b {
  80		return matchesFound
  81	}
  82	return matchesNone
  83}
  84
  85// matchesStateForSlice is a helper which returns matchesFound if v is
  86// non-empty otherwise returns matchesNone.
  87func matchesStateForSlice[T any](v []T) matchesState {
  88	return matchesStatePred(len(v) > 0)
  89}
  90
  91// An expression tree coupled with matches. The matchtree has two
  92// functions:
  93//
  94// * it implements boolean combinations (and, or, not)
  95//
  96// * it implements shortcuts, where we skip documents (for example: if
  97// there are no trigram matches, we can be sure there are no substring
  98// matches). The matchtree iterates over the documents as they are
  99// ordered in the shard.
 100//
 101// The general process for a given (shard, query) is:
 102//
 103// - construct matchTree for the query
 104//
 105// - find all different leaf matchTrees (substring, regexp, etc.)
 106//
 107// in a loop:
 108//
 109//   - find next doc to process using nextDoc
 110//
 111//   - evaluate atoms (leaf expressions that match text)
 112//
 113//   - evaluate the tree using matches(), storing the result in map.
 114//
 115//   - if the complete tree returns (matches() != matchesRequiresHigherCost)
 116//     for the document, collect all text matches by looking at leaf
 117//     matchTrees.
 118type matchTree interface {
 119	docIterator
 120
 121	// matches if cost is high enough. See documentation for matchesState's
 122	// values.
 123	//
 124	// Note: Do not call this directly, rather use evalMatchTree which uses
 125	// known to cache responses once the state transitions away from
 126	// matchesRequiresHigherCost.
 127	matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState
 128}
 129
 130// docMatchTree iterates over documents for which predicate(docID) returns true.
 131type docMatchTree struct {
 132	// the number of documents in a shard.
 133	numDocs uint32
 134
 135	predicate func(docID uint32) bool
 136
 137	// provides additional information about the reason why the docMatchTree was
 138	// created.
 139	reason string
 140
 141	// mutable
 142	firstDone bool
 143	docID     uint32
 144}
 145
 146type bruteForceMatchTree struct {
 147	// mutable
 148	firstDone bool
 149	docID     uint32
 150}
 151
 152type andLineMatchTree struct {
 153	andMatchTree
 154}
 155
 156type andMatchTree struct {
 157	children []matchTree
 158}
 159
 160type orMatchTree struct {
 161	children []matchTree
 162}
 163
 164type notMatchTree struct {
 165	child matchTree
 166}
 167
 168// Returns only the filename of child matches.
 169type fileNameMatchTree struct {
 170	child matchTree
 171}
 172
 173type boostMatchTree struct {
 174	child matchTree
 175	boost float64
 176}
 177
 178// Don't visit this subtree for collecting matches.
 179type noVisitMatchTree struct {
 180	matchTree
 181}
 182
 183type regexpMatchTree struct {
 184	regexp *regexp.Regexp
 185
 186	fileName bool
 187
 188	// mutable
 189	reEvaluated bool
 190	found       []*candidateMatch
 191
 192	// nextDoc, prepare.
 193	bruteForceMatchTree
 194}
 195
 196// \bLITERAL\b
 197type wordMatchTree struct {
 198	word string
 199
 200	fileName bool
 201
 202	// mutable
 203	evaluated bool
 204	found     []*candidateMatch
 205
 206	// nextDoc, prepare.
 207	bruteForceMatchTree
 208}
 209
 210type substrMatchTree struct {
 211	matchIterator
 212
 213	query         *query.Substring
 214	caseSensitive bool
 215	fileName      bool
 216
 217	// mutable
 218	current       []*candidateMatch
 219	contEvaluated bool
 220}
 221
 222type branchQueryMatchTree struct {
 223	fileMasks []uint64
 224	masks     []uint64
 225	repos     []uint16
 226
 227	// mutable
 228	firstDone bool
 229	docID     uint32
 230}
 231
 232func (t *branchQueryMatchTree) branchMask() uint64 {
 233	return t.fileMasks[t.docID] & t.masks[t.repos[t.docID]]
 234}
 235
 236type symbolRegexpMatchTree struct {
 237	matchTree
 238	regexp *regexp.Regexp
 239	all    bool // skips regex match if .*
 240
 241	reEvaluated bool
 242	found       []*candidateMatch
 243}
 244
 245func (t *symbolRegexpMatchTree) prepare(doc uint32) {
 246	t.reEvaluated = false
 247	t.found = t.found[:0]
 248	t.matchTree.prepare(doc)
 249}
 250
 251func (t *symbolRegexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 252	if t.reEvaluated {
 253		return matchesStateForSlice(t.found)
 254	}
 255
 256	if cost < costRegexp {
 257		return matchesRequiresHigherCost
 258	}
 259
 260	sections := cp.docSections()
 261	content := cp.data(false)
 262
 263	found := t.found[:0]
 264	for i, sec := range sections {
 265		var idx []int
 266		if t.all {
 267			idx = []int{0, int(sec.End - sec.Start)}
 268		} else {
 269			idx = t.regexp.FindIndex(content[sec.Start:sec.End])
 270			if idx == nil {
 271				continue
 272			}
 273		}
 274
 275		cm := &candidateMatch{
 276			byteOffset:  sec.Start + uint32(idx[0]),
 277			byteMatchSz: uint32(idx[1] - idx[0]),
 278			symbol:      true,
 279			symbolIdx:   uint32(i),
 280		}
 281		found = append(found, cm)
 282	}
 283	t.found = found
 284	t.reEvaluated = true
 285
 286	return matchesStateForSlice(t.found)
 287}
 288
 289type symbolSubstrMatchTree struct {
 290	*substrMatchTree
 291
 292	patternSize   uint32
 293	fileEndRunes  []uint32
 294	fileEndSymbol []uint32
 295
 296	doc      uint32
 297	sections []DocumentSection
 298
 299	secID uint32
 300}
 301
 302func (t *symbolSubstrMatchTree) prepare(doc uint32) {
 303	t.substrMatchTree.prepare(doc)
 304	t.doc = doc
 305
 306	var fileStart uint32
 307	if doc > 0 {
 308		fileStart = t.fileEndRunes[doc-1]
 309	}
 310
 311	var sections []DocumentSection
 312	if len(t.sections) > 0 {
 313		most := t.fileEndSymbol[len(t.fileEndSymbol)-1]
 314		if most == uint32(len(t.sections)) {
 315			sections = t.sections[t.fileEndSymbol[doc]:t.fileEndSymbol[doc+1]]
 316		} else {
 317			for t.secID < uint32(len(t.sections)) && t.sections[t.secID].Start < fileStart {
 318				t.secID++
 319			}
 320
 321			fileEnd, symbolEnd := t.fileEndRunes[doc], t.secID
 322			for symbolEnd < uint32(len(t.sections)) && t.sections[symbolEnd].Start < fileEnd {
 323				symbolEnd++
 324			}
 325
 326			sections = t.sections[t.secID:symbolEnd]
 327		}
 328	}
 329
 330	secIdx := 0
 331	trimmed := t.current[:0]
 332	for len(sections) > secIdx && len(t.current) > 0 {
 333		start := fileStart + t.current[0].runeOffset
 334		end := start + t.patternSize
 335		if start >= sections[secIdx].End {
 336			secIdx++
 337			continue
 338		}
 339
 340		if start < sections[secIdx].Start {
 341			t.current = t.current[1:]
 342			continue
 343		}
 344
 345		if end <= sections[secIdx].End {
 346			t.current[0].symbol = true
 347			t.current[0].symbolIdx = uint32(secIdx)
 348			trimmed = append(trimmed, t.current[0])
 349		}
 350
 351		t.current = t.current[1:]
 352	}
 353	t.current = trimmed
 354}
 355
 356// all prepare methods
 357
 358func (t *bruteForceMatchTree) prepare(doc uint32) {
 359	t.docID = doc
 360	t.firstDone = true
 361}
 362
 363func (t *docMatchTree) prepare(doc uint32) {
 364	t.docID = doc
 365	t.firstDone = true
 366}
 367
 368func (t *andMatchTree) prepare(doc uint32) {
 369	for _, c := range t.children {
 370		c.prepare(doc)
 371	}
 372}
 373
 374func (t *regexpMatchTree) prepare(doc uint32) {
 375	t.found = t.found[:0]
 376	t.reEvaluated = false
 377	t.bruteForceMatchTree.prepare(doc)
 378}
 379
 380func (t *wordMatchTree) prepare(doc uint32) {
 381	t.found = t.found[:0]
 382	t.evaluated = false
 383	t.bruteForceMatchTree.prepare(doc)
 384}
 385
 386func (t *orMatchTree) prepare(doc uint32) {
 387	for _, c := range t.children {
 388		c.prepare(doc)
 389	}
 390}
 391
 392func (t *notMatchTree) prepare(doc uint32) {
 393	t.child.prepare(doc)
 394}
 395
 396func (t *fileNameMatchTree) prepare(doc uint32) {
 397	t.child.prepare(doc)
 398}
 399
 400func (t *boostMatchTree) prepare(doc uint32) {
 401	t.child.prepare(doc)
 402}
 403
 404func (t *substrMatchTree) prepare(nextDoc uint32) {
 405	t.matchIterator.prepare(nextDoc)
 406	t.current = t.matchIterator.candidates()
 407	t.contEvaluated = false
 408}
 409
 410func (t *branchQueryMatchTree) prepare(doc uint32) {
 411	t.firstDone = true
 412	t.docID = doc
 413}
 414
 415// nextDoc
 416
 417func (t *docMatchTree) nextDoc() uint32 {
 418	var start uint32
 419	if t.firstDone {
 420		start = t.docID + 1
 421	}
 422	for i := start; i < t.numDocs; i++ {
 423		if t.predicate(i) {
 424			return i
 425		}
 426	}
 427	return maxUInt32
 428}
 429
 430func (t *bruteForceMatchTree) nextDoc() uint32 {
 431	if !t.firstDone {
 432		return 0
 433	}
 434	return t.docID + 1
 435}
 436
 437func (t *andMatchTree) nextDoc() uint32 {
 438	var max uint32
 439	for _, c := range t.children {
 440		m := c.nextDoc()
 441		if m > max {
 442			max = m
 443		}
 444	}
 445	return max
 446}
 447
 448func (t *orMatchTree) nextDoc() uint32 {
 449	min := uint32(maxUInt32)
 450	for _, c := range t.children {
 451		m := c.nextDoc()
 452		if m < min {
 453			min = m
 454		}
 455	}
 456	return min
 457}
 458
 459func (t *notMatchTree) nextDoc() uint32 {
 460	return 0
 461}
 462
 463func (t *fileNameMatchTree) nextDoc() uint32 {
 464	return t.child.nextDoc()
 465}
 466
 467func (t *boostMatchTree) nextDoc() uint32 {
 468	return t.child.nextDoc()
 469}
 470
 471func (t *branchQueryMatchTree) nextDoc() uint32 {
 472	var start uint32
 473	if t.firstDone {
 474		start = t.docID + 1
 475	}
 476
 477	for i := start; i < uint32(len(t.fileMasks)); i++ {
 478		if (t.masks[t.repos[i]] & t.fileMasks[i]) != 0 {
 479			return i
 480		}
 481	}
 482	return maxUInt32
 483}
 484
 485// all String methods
 486
 487func (t *bruteForceMatchTree) String() string {
 488	return "all"
 489}
 490
 491func (t *docMatchTree) String() string {
 492	return fmt.Sprintf("doc(%s)", t.reason)
 493}
 494
 495func (t *andMatchTree) String() string {
 496	return fmt.Sprintf("and%v", t.children)
 497}
 498
 499func (t *regexpMatchTree) String() string {
 500	f := ""
 501	if t.fileName {
 502		f = "f"
 503	}
 504	return fmt.Sprintf("%sre(%s)", f, t.regexp)
 505}
 506
 507func (t *wordMatchTree) String() string {
 508	f := ""
 509	if t.fileName {
 510		f = "f"
 511	}
 512	return fmt.Sprintf("%sword(%s)", f, t.word)
 513}
 514
 515func (t *orMatchTree) String() string {
 516	return fmt.Sprintf("or%v", t.children)
 517}
 518
 519func (t *notMatchTree) String() string {
 520	return fmt.Sprintf("not(%v)", t.child)
 521}
 522
 523func (t *noVisitMatchTree) String() string {
 524	return fmt.Sprintf("novisit(%v)", t.matchTree)
 525}
 526
 527func (t *fileNameMatchTree) String() string {
 528	return fmt.Sprintf("f(%v)", t.child)
 529}
 530
 531func (t *boostMatchTree) String() string {
 532	return fmt.Sprintf("boost(%f, %v)", t.boost, t.child)
 533}
 534
 535func (t *substrMatchTree) String() string {
 536	f := ""
 537	if t.fileName {
 538		f = "f"
 539	}
 540
 541	return fmt.Sprintf("%ssubstr(%q, %v, %v)", f, t.query.Pattern, t.current, t.matchIterator)
 542}
 543
 544func (t *branchQueryMatchTree) String() string {
 545	return fmt.Sprintf("branch(%x)", t.masks)
 546}
 547
 548func (t *symbolSubstrMatchTree) String() string {
 549	return fmt.Sprintf("symbol(%v)", t.substrMatchTree)
 550}
 551
 552func (t *symbolRegexpMatchTree) String() string {
 553	return fmt.Sprintf("symbol(%v)", t.matchTree)
 554}
 555
 556// visitMatches visits all atoms in matchTree. Note: This visits
 557// noVisitMatchTree. For collecting matches use visitMatches.
 558func visitMatchTree(t matchTree, f func(matchTree)) {
 559	switch s := t.(type) {
 560	case *andMatchTree:
 561		for _, ch := range s.children {
 562			visitMatchTree(ch, f)
 563		}
 564	case *orMatchTree:
 565		for _, ch := range s.children {
 566			visitMatchTree(ch, f)
 567		}
 568	case *andLineMatchTree:
 569		visitMatchTree(&s.andMatchTree, f)
 570	case *noVisitMatchTree:
 571		visitMatchTree(s.matchTree, f)
 572	case *notMatchTree:
 573		visitMatchTree(s.child, f)
 574	case *fileNameMatchTree:
 575		visitMatchTree(s.child, f)
 576	case *boostMatchTree:
 577		visitMatchTree(s.child, f)
 578	case *symbolSubstrMatchTree:
 579		visitMatchTree(s.substrMatchTree, f)
 580	case *symbolRegexpMatchTree:
 581		visitMatchTree(s.matchTree, f)
 582	default:
 583		f(t)
 584	}
 585}
 586
 587// updateMatchTreeStats calls updateStats on all atoms in mt which have that
 588// function defined.
 589func updateMatchTreeStats(mt matchTree, stats *Stats) {
 590	visitMatchTree(mt, func(mt matchTree) {
 591		if atom, ok := mt.(interface{ updateStats(*Stats) }); ok {
 592			atom.updateStats(stats)
 593		}
 594	})
 595}
 596
 597func visitMatchAtoms(t matchTree, known map[matchTree]bool, f func(matchTree)) {
 598	visitMatches(t, known, 1, func(mt matchTree, _ float64) {
 599		f(mt)
 600	})
 601}
 602
 603// visitMatches visits all atoms which can contribute matches. Note: This
 604// skips noVisitMatchTree.
 605func visitMatches(t matchTree, known map[matchTree]bool, weight float64, f func(matchTree, float64)) {
 606	switch s := t.(type) {
 607	case *andMatchTree:
 608		for _, ch := range s.children {
 609			if known[ch] {
 610				visitMatches(ch, known, weight, f)
 611			}
 612		}
 613	case *andLineMatchTree:
 614		visitMatches(&s.andMatchTree, known, weight, f)
 615	case *orMatchTree:
 616		for _, ch := range s.children {
 617			if known[ch] {
 618				visitMatches(ch, known, weight, f)
 619			}
 620		}
 621	case *boostMatchTree:
 622		visitMatches(s.child, known, weight*s.boost, f)
 623	case *symbolSubstrMatchTree:
 624		visitMatches(s.substrMatchTree, known, weight, f)
 625	case *notMatchTree:
 626	case *noVisitMatchTree:
 627		// don't collect into negative trees.
 628	case *fileNameMatchTree:
 629		// We will just gather the filename if we do not visit this tree.
 630	default:
 631		f(s, weight)
 632	}
 633}
 634
 635// all matches() methods.
 636
 637func (t *docMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 638	return matchesStatePred(t.predicate(cp.idx))
 639}
 640
 641func (t *bruteForceMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 642	return matchesFound
 643}
 644
 645// andLineMatchTree is a performance optimization of andMatchTree. For content
 646// searches we don't want to run the regex engine if there is no line that
 647// contains matches from all terms.
 648func (t *andLineMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 649	if state := evalMatchTree(cp, cost, known, &t.andMatchTree); state != matchesFound {
 650		return state
 651	}
 652
 653	// Invariant: all children have matches. If any line contains all of them we
 654	// can return MatchesFound.
 655
 656	// find child with fewest candidates
 657	min := maxUInt32
 658	fewestChildren := 0
 659	for ix, child := range t.children {
 660		v, ok := child.(*substrMatchTree)
 661		// make sure we are running a content search and that all candidates are a
 662		// substrMatchTree
 663		if !ok || v.fileName {
 664			return matchesFound
 665		}
 666		if len(v.current) < min {
 667			min = len(v.current)
 668			fewestChildren = ix
 669		}
 670	}
 671
 672	type lineRange struct {
 673		start int
 674		end   int
 675	}
 676	lines := make([]lineRange, 0, len(t.children[fewestChildren].(*substrMatchTree).current))
 677	prev := -1
 678	for _, candidate := range t.children[fewestChildren].(*substrMatchTree).current {
 679		line, byteStart, byteEnd := cp.newlines().atOffset(candidate.byteOffset)
 680		if line == prev {
 681			continue
 682		}
 683		prev = line
 684		lines = append(lines, lineRange{byteStart, byteEnd})
 685	}
 686
 687	// children keeps track of the children's candidates we have already seen.
 688	children := make([][]*candidateMatch, 0, len(t.children)-1)
 689	for j, child := range t.children {
 690		if j == fewestChildren {
 691			continue
 692		}
 693		children = append(children, child.(*substrMatchTree).current)
 694	}
 695
 696nextLine:
 697	for i := 0; i < len(lines); i++ {
 698		hits := 1
 699	nextChild:
 700		for j := range children {
 701		nextCandidate:
 702			for len(children[j]) > 0 {
 703				candidate := children[j][0]
 704				bo := int(cp.findOffset(false, candidate.runeOffset))
 705				if bo < lines[i].start {
 706					children[j] = children[j][1:]
 707					continue nextCandidate
 708				}
 709				if bo <= lines[i].end {
 710					hits++
 711					continue nextChild
 712				}
 713				// move the `lines` iterator forward until bo <= line.end
 714				for i < len(lines) && bo > lines[i].end {
 715					i++
 716				}
 717				i--
 718				continue nextLine
 719			}
 720		}
 721		// return early once we found any line that contains matches from all children
 722		if hits == len(t.children) {
 723			return matchesFound
 724		}
 725	}
 726	return matchesNone
 727}
 728
 729func (t *andMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 730	// We have found matches unless a child needs to do more work or it hasn't
 731	// found matches.
 732	state := matchesFound
 733
 734	for _, ch := range t.children {
 735		switch evalMatchTree(cp, cost, known, ch) {
 736		case matchesRequiresHigherCost:
 737			// keep evaluating other children incase we come across matchesNone
 738			state = matchesRequiresHigherCost
 739		case matchesFound:
 740			// will return this if every child has this value
 741		case matchesNone:
 742			return matchesNone
 743		}
 744	}
 745
 746	return state
 747}
 748
 749func (t *orMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 750	// we could short-circuit, but we want to use the other possibilities as a
 751	// ranking signal. So we always return the most conservative state.
 752	state := matchesNone
 753	for _, ch := range t.children {
 754		switch evalMatchTree(cp, cost, known, ch) {
 755		case matchesRequiresHigherCost:
 756			state = matchesRequiresHigherCost
 757		case matchesFound:
 758			if state != matchesRequiresHigherCost {
 759				state = matchesFound
 760			}
 761		case matchesNone:
 762			// noop
 763		}
 764	}
 765	return state
 766}
 767
 768func (t *branchQueryMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 769	return matchesStatePred(t.branchMask() != 0)
 770}
 771
 772func (t *regexpMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 773	if t.reEvaluated {
 774		return matchesStateForSlice(t.found)
 775	}
 776
 777	if cost < costRegexp {
 778		return matchesRequiresHigherCost
 779	}
 780
 781	cp.stats.RegexpsConsidered++
 782	idxs := t.regexp.FindAllIndex(cp.data(t.fileName), -1)
 783	found := t.found[:0]
 784	for _, idx := range idxs {
 785		cm := &candidateMatch{
 786			byteOffset:  uint32(idx[0]),
 787			byteMatchSz: uint32(idx[1] - idx[0]),
 788			fileName:    t.fileName,
 789		}
 790
 791		found = append(found, cm)
 792	}
 793	t.found = found
 794	t.reEvaluated = true
 795
 796	return matchesStateForSlice(t.found)
 797}
 798
 799func (t *wordMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 800	if t.evaluated {
 801		return matchesStateForSlice(t.found)
 802	}
 803
 804	if cost < costRegexp {
 805		return matchesRequiresHigherCost
 806	}
 807
 808	data := cp.data(t.fileName)
 809	offset := 0
 810	found := t.found[:0]
 811	for {
 812		idx := bytes.Index(data[offset:], []byte(t.word))
 813		if idx < 0 {
 814			break
 815		}
 816
 817		relStartOffset := offset + idx
 818		relEndOffset := relStartOffset + len(t.word)
 819
 820		startBoundary := relStartOffset < len(data) && (relStartOffset == 0 || !characterClass(data[relStartOffset-1]))
 821		endBoundary := relEndOffset > 0 && (relEndOffset == len(data) || !characterClass(data[relEndOffset]))
 822		if startBoundary && endBoundary {
 823			found = append(found, &candidateMatch{
 824				byteOffset:  uint32(offset + idx),
 825				byteMatchSz: uint32(len(t.word)),
 826				fileName:    t.fileName,
 827			})
 828		}
 829		offset += idx + len(t.word)
 830	}
 831
 832	t.found = found
 833	t.evaluated = true
 834
 835	return matchesStateForSlice(t.found)
 836}
 837
 838// breakMatchesOnNewlines returns matches resulting from breaking each element
 839// of cms on newlines within text.
 840func breakMatchesOnNewlines(cms []*candidateMatch, text []byte) []*candidateMatch {
 841	var lineCMs []*candidateMatch
 842	for _, cm := range cms {
 843		lineCMs = append(lineCMs, breakOnNewlines(cm, text)...)
 844	}
 845	return lineCMs
 846}
 847
 848// breakOnNewlines returns matches resulting from breaking cm on newlines
 849// within text.
 850func breakOnNewlines(cm *candidateMatch, text []byte) []*candidateMatch {
 851	var cms []*candidateMatch
 852	addMe := &candidateMatch{}
 853	*addMe = *cm
 854	for i := uint32(cm.byteOffset); i < cm.byteOffset+cm.byteMatchSz; i++ {
 855		if text[i] == '\n' {
 856			addMe.byteMatchSz = i - addMe.byteOffset
 857			if addMe.byteMatchSz != 0 {
 858				cms = append(cms, addMe)
 859			}
 860
 861			addMe = &candidateMatch{}
 862			*addMe = *cm
 863			addMe.byteOffset = i + 1
 864		}
 865	}
 866	addMe.byteMatchSz = cm.byteOffset + cm.byteMatchSz - addMe.byteOffset
 867	if addMe.byteMatchSz != 0 {
 868		cms = append(cms, addMe)
 869	}
 870	return cms
 871}
 872
 873// evalMatchTree should be called instead of directly calling
 874// matchTree.matches. It cache known values for future evaluation at higher
 875// costs.
 876func evalMatchTree(cp *contentProvider, cost int, known map[matchTree]bool, mt matchTree) matchesState {
 877	if v, ok := known[mt]; ok {
 878		return matchesStatePred(v)
 879	}
 880
 881	ms := mt.matches(cp, cost, known)
 882	if ms != matchesRequiresHigherCost {
 883		known[mt] = ms == matchesFound
 884	}
 885
 886	return ms
 887}
 888
 889func (t *notMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 890	switch evalMatchTree(cp, cost, known, t.child) {
 891	case matchesRequiresHigherCost:
 892		return matchesRequiresHigherCost
 893	case matchesFound:
 894		return matchesNone
 895	case matchesNone:
 896		return matchesFound
 897	default:
 898		panic("unreachable")
 899	}
 900}
 901
 902func (t *fileNameMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 903	return evalMatchTree(cp, cost, known, t.child)
 904}
 905
 906func (t *boostMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 907	return evalMatchTree(cp, cost, known, t.child)
 908}
 909
 910func (t *substrMatchTree) matches(cp *contentProvider, cost int, known map[matchTree]bool) matchesState {
 911	if t.contEvaluated {
 912		return matchesStateForSlice(t.current)
 913	}
 914
 915	if len(t.current) == 0 {
 916		return matchesNone
 917	}
 918
 919	if t.fileName && cost < costMemory {
 920		return matchesRequiresHigherCost
 921	}
 922
 923	if !t.fileName && cost < costContent {
 924		return matchesRequiresHigherCost
 925	}
 926
 927	pruned := t.current[:0]
 928	for _, m := range t.current {
 929		if m.byteOffset == 0 && m.runeOffset > 0 {
 930			m.byteOffset = cp.findOffset(m.fileName, m.runeOffset)
 931		}
 932		if m.matchContent(cp.data(m.fileName)) {
 933			pruned = append(pruned, m)
 934		}
 935	}
 936	t.current = pruned
 937	t.contEvaluated = true
 938
 939	return matchesStateForSlice(t.current)
 940}
 941
 942type matchTreeOpt struct {
 943	// DisableWordMatchOptimization is used to disable the use of wordMatchTree.
 944	// This was added since we do not support wordMatchTree with symbol search.
 945	DisableWordMatchOptimization bool
 946}
 947
 948func (d *indexData) newMatchTree(q query.Q, opt matchTreeOpt) (matchTree, error) {
 949	if q == nil {
 950		return nil, fmt.Errorf("got nil (sub)query")
 951	}
 952	switch s := q.(type) {
 953	case *query.Regexp:
 954		// RegexpToMatchTreeRecursive tries to distill a matchTree that matches a
 955		// superset of the regexp. If the returned matchTree is equivalent to the
 956		// original regexp, it returns true. An equivalent matchTree has the same
 957		// behaviour as the original regexp and can be used instead.
 958		//
 959		subMT, isEq, _, err := d.regexpToMatchTreeRecursive(s.Regexp, ngramSize, s.FileName, s.CaseSensitive)
 960		if err != nil {
 961			return nil, err
 962		}
 963		// if the query can be used in place of the regexp
 964		// return the subtree
 965		if isEq {
 966			return subMT, nil
 967		}
 968
 969		var tr matchTree
 970		if wmt, ok := regexpToWordMatchTree(s, opt); ok {
 971			// A common search we get is "\bLITERAL\b". Avoid the regex engine and
 972			// provide something faster.
 973			tr = wmt
 974		} else {
 975			prefix := ""
 976			if !s.CaseSensitive {
 977				prefix = "(?i)"
 978			}
 979
 980			tr = &regexpMatchTree{
 981				regexp:   regexp.MustCompile(prefix + s.Regexp.String()),
 982				fileName: s.FileName,
 983			}
 984		}
 985
 986		return &andMatchTree{
 987			children: []matchTree{
 988				tr, &noVisitMatchTree{subMT},
 989			},
 990		}, nil
 991	case *query.And:
 992		var r []matchTree
 993		for _, ch := range s.Children {
 994			ct, err := d.newMatchTree(ch, opt)
 995			if err != nil {
 996				return nil, err
 997			}
 998			r = append(r, ct)
 999		}
1000		return &andMatchTree{r}, nil
1001	case *query.Or:
1002		var r []matchTree
1003		for _, ch := range s.Children {
1004			ct, err := d.newMatchTree(ch, opt)
1005			if err != nil {
1006				return nil, err
1007			}
1008			r = append(r, ct)
1009		}
1010		return &orMatchTree{r}, nil
1011	case *query.Not:
1012		ct, err := d.newMatchTree(s.Child, opt)
1013		return &notMatchTree{
1014			child: ct,
1015		}, err
1016
1017	case *query.Type:
1018		if s.Type != query.TypeFileName {
1019			break
1020		}
1021
1022		ct, err := d.newMatchTree(s.Child, opt)
1023		if err != nil {
1024			return nil, err
1025		}
1026
1027		return &fileNameMatchTree{
1028			child: ct,
1029		}, nil
1030
1031	case *query.Boost:
1032		ct, err := d.newMatchTree(s.Child, opt)
1033		if err != nil {
1034			return nil, err
1035		}
1036
1037		return &boostMatchTree{
1038			child: ct,
1039			boost: s.Boost,
1040		}, nil
1041
1042	case *query.Substring:
1043		return d.newSubstringMatchTree(s)
1044
1045	case *query.Branch:
1046		masks := make([]uint64, 0, len(d.repoMetaData))
1047		if s.Pattern == "HEAD" {
1048			for i := 0; i < len(d.repoMetaData); i++ {
1049				masks = append(masks, 1)
1050			}
1051		} else {
1052			for _, branchIDs := range d.branchIDs {
1053				mask := uint64(0)
1054				for nm, m := range branchIDs {
1055					if (s.Exact && nm == s.Pattern) || (!s.Exact && strings.Contains(nm, s.Pattern)) {
1056						mask |= uint64(m)
1057					}
1058				}
1059				masks = append(masks, mask)
1060			}
1061
1062		}
1063		return &branchQueryMatchTree{
1064			masks:     masks,
1065			fileMasks: d.fileBranchMasks,
1066			repos:     d.repos,
1067		}, nil
1068	case *query.Const:
1069		if s.Value {
1070			return &bruteForceMatchTree{}, nil
1071		} else {
1072			return &noMatchTree{Why: "const"}, nil
1073		}
1074	case *query.Language:
1075		code, ok := d.metaData.LanguageMap[s.Language]
1076		if !ok {
1077			return &noMatchTree{Why: "lang"}, nil
1078		}
1079		return &docMatchTree{
1080			reason:  "language",
1081			numDocs: d.numDocs(),
1082			predicate: func(docID uint32) bool {
1083				return d.getLanguage(docID) == code
1084			},
1085		}, nil
1086
1087	case *query.Symbol:
1088		// Disable WordMatchTree since we don't support it in symbols yet.
1089		optCopy := opt
1090		optCopy.DisableWordMatchOptimization = true
1091
1092		subMT, err := d.newMatchTree(s.Expr, optCopy)
1093		if err != nil {
1094			return nil, err
1095		}
1096
1097		if substr, ok := subMT.(*substrMatchTree); ok {
1098			return &symbolSubstrMatchTree{
1099				substrMatchTree: substr,
1100				patternSize:     uint32(utf8.RuneCountInString(substr.query.Pattern)),
1101				fileEndRunes:    d.fileEndRunes,
1102				fileEndSymbol:   d.fileEndSymbol,
1103				sections:        d.runeDocSections,
1104			}, nil
1105		}
1106
1107		var regexp *regexp.Regexp
1108		visitMatchTree(subMT, func(mt matchTree) {
1109			if t, ok := mt.(*regexpMatchTree); ok {
1110				regexp = t.regexp
1111			}
1112		})
1113		if regexp == nil {
1114			return nil, fmt.Errorf("found %T inside query.Symbol", subMT)
1115		}
1116
1117		return &symbolRegexpMatchTree{
1118			regexp:    regexp,
1119			all:       regexp.String() == "(?i)(?-s:.)*",
1120			matchTree: subMT,
1121		}, nil
1122
1123	case *query.FileNameSet:
1124		return &docMatchTree{
1125			reason:  "FileNameSet",
1126			numDocs: d.numDocs(),
1127			predicate: func(docID uint32) bool {
1128				fileName := d.fileName(docID)
1129				_, ok := s.Set[string(fileName)]
1130				return ok
1131			},
1132		}, nil
1133
1134	case *query.BranchesRepos:
1135		reposBranchesWant := make([]uint64, len(d.repoMetaData))
1136		for repoIdx := range d.repoMetaData {
1137			var mask uint64
1138			for _, br := range s.List {
1139				if br.Repos.Contains(d.repoMetaData[repoIdx].ID) {
1140					mask |= uint64(d.branchIDs[repoIdx][br.Branch])
1141				}
1142			}
1143			reposBranchesWant[repoIdx] = mask
1144		}
1145		return &docMatchTree{
1146			reason:  "BranchesRepos",
1147			numDocs: d.numDocs(),
1148			predicate: func(docID uint32) bool {
1149				return d.fileBranchMasks[docID]&reposBranchesWant[d.repos[docID]] != 0
1150			},
1151		}, nil
1152
1153	case *query.RepoSet:
1154		reposWant := make([]bool, len(d.repoMetaData))
1155		for repoIdx, r := range d.repoMetaData {
1156			if _, ok := s.Set[r.Name]; ok {
1157				reposWant[repoIdx] = true
1158			}
1159		}
1160		return &docMatchTree{
1161			reason:  "RepoSet",
1162			numDocs: d.numDocs(),
1163			predicate: func(docID uint32) bool {
1164				return reposWant[d.repos[docID]]
1165			},
1166		}, nil
1167
1168	case *query.RepoIDs:
1169		reposWant := make([]bool, len(d.repoMetaData))
1170		for repoIdx, r := range d.repoMetaData {
1171			if s.Repos.Contains(r.ID) {
1172				reposWant[repoIdx] = true
1173			}
1174		}
1175		return &docMatchTree{
1176			reason:  "RepoIDs",
1177			numDocs: d.numDocs(),
1178			predicate: func(docID uint32) bool {
1179				return reposWant[d.repos[docID]]
1180			},
1181		}, nil
1182
1183	case *query.Repo:
1184		reposWant := make([]bool, len(d.repoMetaData))
1185		for repoIdx, r := range d.repoMetaData {
1186			if s.Regexp.MatchString(r.Name) {
1187				reposWant[repoIdx] = true
1188			}
1189		}
1190		return &docMatchTree{
1191			reason:  "Repo",
1192			numDocs: d.numDocs(),
1193			predicate: func(docID uint32) bool {
1194				return reposWant[d.repos[docID]]
1195			},
1196		}, nil
1197
1198	case *query.RepoRegexp:
1199		reposWant := make([]bool, len(d.repoMetaData))
1200		for repoIdx, r := range d.repoMetaData {
1201			if s.Regexp.MatchString(r.Name) {
1202				reposWant[repoIdx] = true
1203			}
1204		}
1205		return &docMatchTree{
1206			reason:  "RepoRegexp",
1207			numDocs: d.numDocs(),
1208			predicate: func(docID uint32) bool {
1209				return reposWant[d.repos[docID]]
1210			},
1211		}, nil
1212
1213	case query.RawConfig:
1214		return &docMatchTree{
1215			reason:  s.String(),
1216			numDocs: d.numDocs(),
1217			predicate: func(docID uint32) bool {
1218				return uint8(s)&d.rawConfigMasks[d.repos[docID]] == uint8(s)
1219			},
1220		}, nil
1221	}
1222	log.Panicf("type %T", q)
1223	return nil, nil
1224}
1225
1226func (d *indexData) newSubstringMatchTree(s *query.Substring) (matchTree, error) {
1227	st := &substrMatchTree{
1228		query:         s,
1229		caseSensitive: s.CaseSensitive,
1230		fileName:      s.FileName,
1231	}
1232
1233	if utf8.RuneCountInString(s.Pattern) < ngramSize {
1234		prefix := ""
1235		if !s.CaseSensitive {
1236			prefix = "(?i)"
1237		}
1238		t := &regexpMatchTree{
1239			regexp:   regexp.MustCompile(prefix + regexp.QuoteMeta(s.Pattern)),
1240			fileName: s.FileName,
1241		}
1242		return t, nil
1243	}
1244
1245	result, err := d.iterateNgrams(s)
1246	if err != nil {
1247		return nil, err
1248	}
1249	st.matchIterator = result
1250	return st, nil
1251}
1252
1253func regexpToWordMatchTree(q *query.Regexp, opt matchTreeOpt) (_ *wordMatchTree, ok bool) {
1254	if opt.DisableWordMatchOptimization {
1255		return nil, false
1256	}
1257	// Needs to be case sensitive
1258	if !q.CaseSensitive || q.Regexp.Flags&syntax.FoldCase != 0 {
1259		return nil, false
1260	}
1261	// We want a regex that looks like Op.Concat[OpWordBoundary OpLiteral OpWordBoundary]
1262	if q.Regexp.Op != syntax.OpConcat || len(q.Regexp.Sub) != 3 {
1263		return nil, false
1264	}
1265	sub := q.Regexp.Sub
1266	if sub[0].Op != syntax.OpWordBoundary || sub[1].Op != syntax.OpLiteral || sub[2].Op != syntax.OpWordBoundary {
1267		return nil, false
1268	}
1269
1270	return &wordMatchTree{
1271		word:     string(sub[1].Rune),
1272		fileName: q.FileName,
1273	}, true
1274}
1275
1276// pruneMatchTree removes impossible branches from the matchTree, as indicated
1277// by substrMatchTree having a noMatchTree and the resulting impossible and clauses and so forth.
1278func pruneMatchTree(mt matchTree) (matchTree, error) {
1279	var err error
1280	switch mt := mt.(type) {
1281	// leaf nodes that we test with the filter:
1282	case *substrMatchTree:
1283		if res, ok := mt.matchIterator.(*ngramIterationResults); ok {
1284			if _, ok := res.matchIterator.(*noMatchTree); ok {
1285				return nil, nil
1286			}
1287		}
1288	// recursive tree structures:
1289	case *andMatchTree:
1290		// Any branch of an and becoming impossible means the entire clause
1291		// is impossible. Otherwise, just handle rewrites.
1292		for i, child := range mt.children {
1293			newChild, err := pruneMatchTree(child)
1294			if err != nil {
1295				return nil, err
1296			}
1297			if newChild == nil {
1298				return nil, nil
1299			}
1300			mt.children[i] = newChild
1301		}
1302	case *orMatchTree:
1303		// *All* branches of an OR becoming impossible means the entire clause
1304		// is impossible. Otherwise, drop impossible subclauses and handle
1305		// rewrites, including simplifying to a singular resulting child branch.
1306		n := 0
1307		for _, child := range mt.children {
1308			newChild, err := pruneMatchTree(child)
1309			if err != nil {
1310				return nil, err
1311			}
1312			if newChild != nil {
1313				mt.children[n] = newChild
1314				n++
1315			}
1316		}
1317		mt.children = mt.children[:n]
1318		if len(mt.children) == 1 {
1319			return mt.children[0], nil
1320		} else if len(mt.children) == 0 {
1321			return nil, nil
1322		}
1323	case *noVisitMatchTree:
1324		mt.matchTree, err = pruneMatchTree(mt.matchTree)
1325		if err != nil {
1326			return nil, err
1327		}
1328		if mt.matchTree == nil {
1329			return nil, nil
1330		}
1331	case *fileNameMatchTree:
1332		mt.child, err = pruneMatchTree(mt.child)
1333	case *boostMatchTree:
1334		mt.child, err = pruneMatchTree(mt.child)
1335		if err != nil {
1336			return nil, err
1337		}
1338		if mt.child == nil {
1339			return nil, nil
1340		}
1341	case *andLineMatchTree:
1342		child, err := pruneMatchTree(&mt.andMatchTree)
1343		if err != nil {
1344			return nil, err
1345		}
1346		if child == nil {
1347			return nil, nil
1348		}
1349		if c, ok := child.(*andMatchTree); ok {
1350			mt.andMatchTree = *c
1351		} else {
1352			// the and was simplified to a single clause,
1353			// so the linematch portion is irrelevant.
1354			return mt, nil
1355		}
1356	case *notMatchTree:
1357		mt.child, err = pruneMatchTree(mt.child)
1358		if err != nil {
1359			return nil, err
1360		}
1361		if mt.child == nil {
1362			// not false => true
1363			return &bruteForceMatchTree{}, nil
1364		}
1365	// unhandled:
1366	case *docMatchTree:
1367	case *bruteForceMatchTree:
1368	case *regexpMatchTree:
1369	case *wordMatchTree:
1370	}
1371	return mt, err
1372}
Configure Feed

Configure Feed