Skip to main content

gruel_lsp/
position.rs

//! Position conversion between LSP `Position` (UTF-16 code units) and Gruel
//! byte offsets (ADR-0091).
//!
//! LSP defaults to UTF-16; we also support UTF-8 (LSP 3.17 `positionEncoding`)
//! to skip the conversion entirely for capable clients.
6
7use gruel_util::span::Span;
8use lsp_types::{Position, Range};
9
/// Position encoding negotiated with the client.
///
/// Selects the unit in which the `character` field of an LSP `Position` is
/// counted. The default is [`PositionEncoding::Utf16`], which is what LSP
/// uses unless the client negotiates otherwise.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum PositionEncoding {
    /// `character` counts UTF-8 bytes, i.e. it is a plain byte offset from
    /// the start of the line.
    Utf8,
    /// `character` counts UTF-16 code units from the start of the line.
    #[default]
    Utf16,
}
22
/// Cached line start byte offsets for a source string.
///
/// `line_starts[i]` is the byte offset of the start of line `i` (0-indexed
/// for LSP). Always begins with `0`. Lines are split on `\n`; the `\r` of a
/// `\r\n` terminator is treated as part of the terminator by
/// [`LineMap::line_end`].
///
/// Offsets are stored as `u32`, so the source is assumed to be smaller than
/// 4 GiB.
#[derive(Debug, Clone)]
pub struct LineMap {
    line_starts: Vec<u32>,
    source_len: u32,
}

impl LineMap {
    /// Build the map by scanning `source` once for `\n`.
    pub fn new(source: &str) -> Self {
        let mut line_starts = vec![0u32];
        // Every line after the first starts on the byte following a `\n`.
        line_starts.extend(source.match_indices('\n').map(|(i, _)| (i + 1) as u32));
        Self {
            line_starts,
            source_len: source.len() as u32,
        }
    }

    /// Number of lines (always at least 1).
    pub fn line_count(&self) -> u32 {
        self.line_starts.len() as u32
    }

    /// Get the 0-based line index containing `byte`.
    ///
    /// Offsets past the end of the source are clamped to the source length,
    /// so they map to the last line.
    pub fn line_for_byte(&self, byte: u32) -> u32 {
        let byte = byte.min(self.source_len);
        // `partition_point` yields the count of line starts <= `byte`; the
        // containing line is the last of those. `line_starts[0] == 0`, so
        // the count is never zero.
        let starts_at_or_before = self.line_starts.partition_point(|&s| s <= byte);
        starts_at_or_before.saturating_sub(1) as u32
    }

    /// Byte offset where the given 0-based line starts. Returns
    /// `source_len` if `line` is past the end.
    pub fn line_start(&self, line: u32) -> u32 {
        self.line_starts
            .get(line as usize)
            .copied()
            .unwrap_or(self.source_len)
    }

    /// Byte offset just after the last character of the given 0-based line,
    /// excluding the line terminator (`\n` or `\r\n`), if any.
    ///
    /// `source` must be the string this map was built from. For the last
    /// line, or any `line` past the end, this returns the source length.
    pub fn line_end(&self, source: &str, line: u32) -> u32 {
        let next_start = match self.line_starts.get(line.saturating_add(1) as usize) {
            Some(&start) => start as usize,
            None => return self.source_len,
        };
        let bytes = source.as_bytes();
        let mut end = next_start;
        if let Some(before) = end.checked_sub(1) {
            if bytes.get(before) == Some(&b'\n') {
                end = before;
                // A `\r` directly before the `\n` belongs to the terminator,
                // not to the line content. It cannot belong to the previous
                // line, because that line ends in `\n`.
                if let Some(cr) = end.checked_sub(1) {
                    if bytes.get(cr) == Some(&b'\r') {
                        end = cr;
                    }
                }
            }
        }
        end as u32
    }
}
87
88/// Convert a byte offset within `source` to an LSP `Position`.
89pub fn byte_to_position(
90    line_map: &LineMap,
91    source: &str,
92    byte: u32,
93    encoding: PositionEncoding,
94) -> Position {
95    let byte = byte.min(source.len() as u32);
96    let line = line_map.line_for_byte(byte);
97    let line_start = line_map.line_start(line) as usize;
98    let prefix = &source[line_start..byte as usize];
99    let character = match encoding {
100        PositionEncoding::Utf8 => prefix.len() as u32,
101        PositionEncoding::Utf16 => prefix.encode_utf16().count() as u32,
102    };
103    Position { line, character }
104}
105
106/// Convert an LSP `Position` to a byte offset within `source`.
107pub fn position_to_byte(
108    line_map: &LineMap,
109    source: &str,
110    pos: Position,
111    encoding: PositionEncoding,
112) -> u32 {
113    let line_start = line_map.line_start(pos.line) as usize;
114    let line_end = line_map.line_end(source, pos.line) as usize;
115    let line_text = &source[line_start..line_end];
116    let column_bytes = match encoding {
117        PositionEncoding::Utf8 => (pos.character as usize).min(line_text.len()),
118        PositionEncoding::Utf16 => {
119            let mut utf16_count = 0u32;
120            let mut byte_off = 0usize;
121            for c in line_text.chars() {
122                if utf16_count >= pos.character {
123                    break;
124                }
125                let unit_len = c.len_utf16() as u32;
126                utf16_count += unit_len;
127                byte_off += c.len_utf8();
128            }
129            byte_off.min(line_text.len())
130        }
131    };
132    (line_start + column_bytes) as u32
133}
134
135/// Convert a Gruel `Span` (within `source`) to an LSP `Range`.
136pub fn span_to_range(
137    line_map: &LineMap,
138    source: &str,
139    span: Span,
140    encoding: PositionEncoding,
141) -> Range {
142    Range {
143        start: byte_to_position(line_map, source, span.start, encoding),
144        end: byte_to_position(line_map, source, span.end, encoding),
145    }
146}
147
#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for building an LSP `Position`.
    fn at(line: u32, character: u32) -> Position {
        Position { line, character }
    }

    #[test]
    fn byte_to_position_utf8() {
        let src = "hello\nworld";
        let map = LineMap::new(src);
        // (byte offset, expected position)
        let cases = [
            (0u32, at(0, 0)),
            (5, at(0, 5)),
            (6, at(1, 0)),
            (11, at(1, 5)),
        ];
        for (byte, expected) in cases {
            assert_eq!(
                byte_to_position(&map, src, byte, PositionEncoding::Utf8),
                expected
            );
        }
    }

    #[test]
    fn position_to_byte_utf8_roundtrip() {
        let src = "foo\nbar\nbaz";
        let map = LineMap::new(src);
        // (position, expected byte offset)
        let cases = [
            (at(0, 0), 0u32),
            (at(0, 3), 3),
            (at(1, 0), 4),
            (at(1, 2), 6),
            (at(2, 3), 11),
        ];
        for (pos, expected) in cases {
            assert_eq!(
                position_to_byte(&map, src, pos, PositionEncoding::Utf8),
                expected
            );
        }
    }

    #[test]
    fn utf16_handles_surrogate_pairs() {
        // 🦀 is one Unicode scalar (4 UTF-8 bytes, 2 UTF-16 code units).
        let src = "ab🦀c";
        let map = LineMap::new(src);
        let locate = |byte: u32| byte_to_position(&map, src, byte, PositionEncoding::Utf16);

        let before_a = locate(0);
        let before_b = locate(1);
        let before_crab = locate(2);
        let after_crab = locate(6);
        assert_eq!(before_a.character, 0);
        assert_eq!(before_b.character, 1);
        assert_eq!(before_crab.character, 2);
        assert_eq!(after_crab.character, 4);

        // Round-trip
        assert_eq!(
            position_to_byte(&map, src, before_a, PositionEncoding::Utf16),
            0
        );
        assert_eq!(
            position_to_byte(&map, src, after_crab, PositionEncoding::Utf16),
            6
        );
    }

    #[test]
    fn span_to_range_basic() {
        let src = "let x = 42;";
        let map = LineMap::new(src);
        let span = Span::with_file(gruel_util::span::FileId::DEFAULT, 4, 5);
        let Range { start, end } = span_to_range(&map, src, span, PositionEncoding::Utf8);
        assert_eq!(start, at(0, 4));
        assert_eq!(end, at(0, 5));
    }

    #[test]
    fn line_map_empty_source() {
        let map = LineMap::new("");
        assert_eq!(map.line_count(), 1);
        assert_eq!(map.line_for_byte(0), 0);
        assert_eq!(map.line_start(0), 0);
    }
}