聊聊Spring AI Alibaba的OneNoteDocumentReader
序
本文主要研究一下Spring AI Alibaba的OneNoteDocumentReader
OneNoteDocumentReader
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReader.java
public class OneNoteDocumentReader implements DocumentReader {public static final String MICROSOFT_GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0";public static final String NOTEBOOK_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id";public static final String SECTION_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id";private static final Logger log = LoggerFactory.getLogger(OneNoteDocumentReader.class);private final OneNoteResource oneNoteResource;private final HttpClient client;private final String accessToken;public OneNoteDocumentReader(String accessToken, OneNoteResource oneNoteResource) {this.accessToken = accessToken;this.oneNoteResource = oneNoteResource;this.client = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();}/*** Retrieves the content of a OneNote notebook by querying the Microsoft Graph API.*/private List<String> getNoteBookContent(String accessToken, String notebookId) {// Build the URI for fetching pages from the notebookString uri = MICROSOFT_GRAPH_BASE_URL + NOTEBOOK_ID_FILTER_PREFIX + "+eq+" + "'" + notebookId + "'";// Get the page IDs from the notebook by querying the APIList<String> pageIdsFromNotebook = getOneNotePageIdsByURI(accessToken, uri);// Fetch the content for each page by its IDreturn pageIdsFromNotebook.stream().map(id -> getPageContent(accessToken, id)).toList();}/*** Retrieves the content of a OneNote section by querying the Microsoft Graph API.*/private List<String> getSectionContent(String accessToken, String sectionId) {// Build the URI for fetching pages from the sectionString uri = MICROSOFT_GRAPH_BASE_URL + SECTION_ID_FILTER_PREFIX + "+eq+" + "'" + sectionId + "'";// Get the page IDs from the notebook by querying the APIList<String> pageIdsBySection = getOneNotePageIdsByURI(accessToken, uri);// Fetch the content for each page by its IDreturn pageIdsBySection.stream().map(id -> getPageContent(accessToken, id)).toList();}private 
List<String> getOneNotePageIdsByURI(String accessToken, String uri) {HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).header("Content-Type", "application/json").uri(URI.create(uri)).GET().build();try {HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch pages information");// Parse JSON response and extract page IDsreturn parsePageIdsFromJson(response.body());}catch (Exception e) {throw new RuntimeException("Failed to get pages id", e);}}/*** Parses the JSON response and extracts page IDs*/private List<String> parsePageIdsFromJson(String jsonResponse) {JsonObject rootObject = JsonParser.parseString(jsonResponse).getAsJsonObject();JsonArray valueArray = rootObject.getAsJsonArray("value");return valueArray.asList().stream().map(jsonElement -> jsonElement.getAsJsonObject().get("id").getAsString()).toList();}/*** Retrieves the content of a specific OneNote page by querying the Microsoft Graph* API.*/private String getPageContent(String accessToken, String pageId) {URI uri = URI.create(MICROSOFT_GRAPH_BASE_URL + "/me/onenote/pages/" + pageId + "/content");HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).uri(uri).GET().build();try {HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");return parseHtmlContent(response.body());}catch (Exception e) {log.warn("Failed to get page content with token: {}, pageId: {}, {}", accessToken, pageId, e.getMessage(),e);throw new RuntimeException("Failed to get page content", e);}}@Overridepublic List<Document> get() {// Get the access tokenString accessToken = this.accessToken;// Get the resource type and resource ID for the OneNote resourceOneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();String resourceId = 
this.oneNoteResource.getResourceId();// Parameters checkAssert.notNull(accessToken, "token must not be null");Assert.notNull(resourceType, "resource type must not be null");Assert.notNull(resourceId, "resource id must not be null");// Fetch content based on the resource type (Notebook, Section, or Page)List<String> content = switch (resourceType) {case NOTEBOOK -> getNoteBookContent(accessToken, resourceId);case SECTION -> getSectionContent(accessToken, resourceId);case PAGE -> Collections.singletonList(getPageContent(accessToken, resourceId));};// Build metadata for the resourceMap<String, Object> metaData = buildMetadata();// Construct a list of Document objectsreturn content.stream().map(c -> new Document(c, metaData)).toList();}private String parseHtmlContent(String htmlContent) {// Parse the HTML contentorg.jsoup.nodes.Document parseDoc = Jsoup.parse(htmlContent);// Get title and text content, ensuring title is not emptyString title = parseDoc.title();String text = parseDoc.text();// Return title and content in a readable formatreturn (StringUtils.hasText(title) ? 
title : "") + "\n" + text;}/*** Builds metadata for a given OneNote resource (Notebook, Section, or Page) by* querying the Microsoft Graph API.*/private Map<String, Object> buildMetadata() {Map<String, Object> metadata = new HashMap<>();String accessToken = this.accessToken;String resourceId = this.oneNoteResource.getResourceId();OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();String endpoint = switch (resourceType) {case NOTEBOOK -> "/notebooks/";case SECTION -> "/sections/";case PAGE -> "/pages/";};String uriPath = MICROSOFT_GRAPH_BASE_URL + "/me/onenote" + endpoint + resourceId;URI uri = URI.create(uriPath);// Add basic metadata to the map (resource URI, type, and ID)metadata.put(OneNoteResource.SOURCE, uriPath);metadata.put("resourceType", resourceType.name());metadata.put("resourceId", resourceId);try {HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).header("Content-Type", "application/json").uri(uri).GET().build();HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");// Parse the JSON response to extract relevant metadata fieldsJsonObject jsonMetaData = JsonParser.parseString(response.body()).getAsJsonObject();// Extract creation date and add to metadata if availableString createDateTime = Optional.ofNullable(jsonMetaData.get("createdDateTime")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(createDateTime)) {metadata.put("createdTime", Instant.parse(createDateTime).toEpochMilli());}// Extract last modified date and add to metadata if availableString lastModifiedDateTime = Optional.ofNullable(jsonMetaData.get("lastModifiedDateTime")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(lastModifiedDateTime)) {metadata.put("lastModifiedTime", Instant.parse(lastModifiedDateTime).toEpochMilli());}// Extract content URL and add to metadata if 
availableString contentURL = Optional.ofNullable(jsonMetaData.get("contentUrl")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(contentURL)) {metadata.put("contentURL", contentURL);}}catch (Exception e) {log.warn("Failed to get page content with token: {}, resourceId: {}, resourceType: {}, {}", accessToken,resourceId, resourceType, e.getMessage(), e);throw new RuntimeException("Failed to get page content", e);}return metadata;}}
OneNoteDocumentReader构造器要求输入accessToken及oneNoteResource,它会构建HttpClient,其get方法根据resourceType执行不同的逻辑,NOTEBOOK执行getNoteBookContent,SECTION执行getSectionContent,PAGE执行getPageContent,之后通过buildMetadata构建metaData,最后构建Document返回
getNoteBookContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id+eq+'notebookId'
提取pageIds,然后通过getPageContent获取内容
getSectionContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id+eq+'sectionId'
提取pageIds,然后通过getPageContent获取内容
getPageContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/pageId/content
获取html结果,再通过jsoup解析title、text,最后用换行符(\n)将二者拼接后返回
buildMetadata方法根据不同的resourceType构建不同的请求uri,请求之后提取createdTime、lastModifiedTime、contentURL
OneNoteResource
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteResource.java
/**
 * Identifies the OneNote item to read: a resource id together with the kind of item
 * (notebook, section, or page) that the id refers to.
 */
public class OneNoteResource implements Resource {

    /** Metadata key under which the resource URI is stored. */
    public static final String SOURCE = "source";

    /** Kinds of OneNote item that can be addressed. */
    public enum ResourceType {

        NOTEBOOK, SECTION, PAGE

    }

    private final ResourceType resourceType;

    private final String resourceId;

    /**
     * Creates a resource descriptor after validating both arguments.
     * @param resourceId id of the OneNote item; must contain text
     * @param resourceType kind of the OneNote item; must not be {@code null}
     */
    public OneNoteResource(String resourceId, ResourceType resourceType) {
        Assert.hasText(resourceId, "ResourceId must not be empty");
        Assert.notNull(resourceType, "ResourceType must not be null");
        this.resourceType = resourceType;
        this.resourceId = resourceId;
    }

    /** Returns the kind of OneNote item this resource points at. */
    public ResourceType getResourceType() {
        return this.resourceType;
    }

    /** Returns the id of the OneNote item this resource points at. */
    public String getResourceId() {
        return this.resourceId;
    }

    //......
}
OneNoteResource主要是定义了NOTEBOOK, SECTION, PAGE这三种resourceType以及resourceId
示例
community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/test/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReaderTest.java
@EnabledIfEnvironmentVariable(named = "ONENOTE_ACCESS_TOKEN", matches = ".+")
public class OneNoteDocumentReaderTest {private static final String TEST_ACCESS_TOKEN = System.getenv("ONENOTE_ACCESS_TOKEN");private static final String TEST_NOTEBOOK_ID = "${notebookId}";private static final String TEST_SECTION_ID = "${sectionId}";private static final String TEST_PAGE_ID = "${pageId}";private OneNoteDocumentReader oneNoteDocumentReader;static {if (TEST_ACCESS_TOKEN == null || TEST_ACCESS_TOKEN.isEmpty()) {System.out.println("ONENOTE_ACCESS_TOKEN environment variable is not set. Tests will be skipped.");}}@Testpublic void test_load_page() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_PAGE_ID).resourceType(OneNoteResource.ResourceType.PAGE).build();OneNoteDocumentReader oneNoteDocumentReader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.PAGE.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_PAGE_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}@Testpublic void test_load_section() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_SECTION_ID).resourceType(OneNoteResource.ResourceType.SECTION).build();OneNoteDocumentReader oneNoteDocumentReader = new 
OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.SECTION.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_SECTION_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}@Testpublic void test_load_notebook() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_NOTEBOOK_ID).resourceType(OneNoteResource.ResourceType.NOTEBOOK).build();OneNoteDocumentReader oneNoteDocumentReader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.NOTEBOOK.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_NOTEBOOK_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}}
小结
spring-ai-alibaba-starter-document-reader-onenote提供了OneNoteDocumentReader,用于根据accessToken、resourceId、resourceType获取OneNote的内容及元数据(metadata)。
doc
- java2ai