当前位置: 首页 > news >正文

聊聊Spring AI Alibaba的OneNoteDocumentReader

本文主要研究一下Spring AI Alibaba的OneNoteDocumentReader

OneNoteDocumentReader

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReader.java

public class OneNoteDocumentReader implements DocumentReader {public static final String MICROSOFT_GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0";public static final String NOTEBOOK_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id";public static final String SECTION_ID_FILTER_PREFIX = "/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id";private static final Logger log = LoggerFactory.getLogger(OneNoteDocumentReader.class);private final OneNoteResource oneNoteResource;private final HttpClient client;private final String accessToken;public OneNoteDocumentReader(String accessToken, OneNoteResource oneNoteResource) {this.accessToken = accessToken;this.oneNoteResource = oneNoteResource;this.client = HttpClient.newBuilder().version(HttpClient.Version.HTTP_2).build();}/*** Retrieves the content of a OneNote notebook by querying the Microsoft Graph API.*/private List<String> getNoteBookContent(String accessToken, String notebookId) {// Build the URI for fetching pages from the notebookString uri = MICROSOFT_GRAPH_BASE_URL + NOTEBOOK_ID_FILTER_PREFIX + "+eq+" + "'" + notebookId + "'";// Get the page IDs from the notebook by querying the APIList<String> pageIdsFromNotebook = getOneNotePageIdsByURI(accessToken, uri);// Fetch the content for each page by its IDreturn pageIdsFromNotebook.stream().map(id -> getPageContent(accessToken, id)).toList();}/*** Retrieves the content of a OneNote section by querying the Microsoft Graph API.*/private List<String> getSectionContent(String accessToken, String sectionId) {// Build the URI for fetching pages from the sectionString uri = MICROSOFT_GRAPH_BASE_URL + SECTION_ID_FILTER_PREFIX + "+eq+" + "'" + sectionId + "'";// Get the page IDs from the notebook by querying the APIList<String> pageIdsBySection = getOneNotePageIdsByURI(accessToken, uri);// Fetch the content for each page by its IDreturn pageIdsBySection.stream().map(id -> getPageContent(accessToken, id)).toList();}private 
List<String> getOneNotePageIdsByURI(String accessToken, String uri) {HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).header("Content-Type", "application/json").uri(URI.create(uri)).GET().build();try {HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch pages information");// Parse JSON response and extract page IDsreturn parsePageIdsFromJson(response.body());}catch (Exception e) {throw new RuntimeException("Failed to get pages id", e);}}/*** Parses the JSON response and extracts page IDs*/private List<String> parsePageIdsFromJson(String jsonResponse) {JsonObject rootObject = JsonParser.parseString(jsonResponse).getAsJsonObject();JsonArray valueArray = rootObject.getAsJsonArray("value");return valueArray.asList().stream().map(jsonElement -> jsonElement.getAsJsonObject().get("id").getAsString()).toList();}/*** Retrieves the content of a specific OneNote page by querying the Microsoft Graph* API.*/private String getPageContent(String accessToken, String pageId) {URI uri = URI.create(MICROSOFT_GRAPH_BASE_URL + "/me/onenote/pages/" + pageId + "/content");HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).uri(uri).GET().build();try {HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");return parseHtmlContent(response.body());}catch (Exception e) {log.warn("Failed to get page content with token: {}, pageId: {}, {}", accessToken, pageId, e.getMessage(),e);throw new RuntimeException("Failed to get page content", e);}}@Overridepublic List<Document> get() {// Get the access tokenString accessToken = this.accessToken;// Get the resource type and resource ID for the OneNote resourceOneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();String resourceId = 
this.oneNoteResource.getResourceId();// Parameters checkAssert.notNull(accessToken, "token must not be null");Assert.notNull(resourceType, "resource type must not be null");Assert.notNull(resourceId, "resource id must not be null");// Fetch content based on the resource type (Notebook, Section, or Page)List<String> content = switch (resourceType) {case NOTEBOOK -> getNoteBookContent(accessToken, resourceId);case SECTION -> getSectionContent(accessToken, resourceId);case PAGE -> Collections.singletonList(getPageContent(accessToken, resourceId));};// Build metadata for the resourceMap<String, Object> metaData = buildMetadata();// Construct a list of Document objectsreturn content.stream().map(c -> new Document(c, metaData)).toList();}private String parseHtmlContent(String htmlContent) {// Parse the HTML contentorg.jsoup.nodes.Document parseDoc = Jsoup.parse(htmlContent);// Get title and text content, ensuring title is not emptyString title = parseDoc.title();String text = parseDoc.text();// Return title and content in a readable formatreturn (StringUtils.hasText(title) ? 
title : "") + "\n" + text;}/*** Builds metadata for a given OneNote resource (Notebook, Section, or Page) by* querying the Microsoft Graph API.*/private Map<String, Object> buildMetadata() {Map<String, Object> metadata = new HashMap<>();String accessToken = this.accessToken;String resourceId = this.oneNoteResource.getResourceId();OneNoteResource.ResourceType resourceType = this.oneNoteResource.getResourceType();String endpoint = switch (resourceType) {case NOTEBOOK -> "/notebooks/";case SECTION -> "/sections/";case PAGE -> "/pages/";};String uriPath = MICROSOFT_GRAPH_BASE_URL + "/me/onenote" + endpoint + resourceId;URI uri = URI.create(uriPath);// Add basic metadata to the map (resource URI, type, and ID)metadata.put(OneNoteResource.SOURCE, uriPath);metadata.put("resourceType", resourceType.name());metadata.put("resourceId", resourceId);try {HttpRequest request = HttpRequest.newBuilder().header("Authorization", accessToken).header("Content-Type", "application/json").uri(uri).GET().build();HttpResponse<String> response = this.client.send(request, HttpResponse.BodyHandlers.ofString());Assert.isTrue(response.statusCode() == 200, "Failed to fetch page blocks");// Parse the JSON response to extract relevant metadata fieldsJsonObject jsonMetaData = JsonParser.parseString(response.body()).getAsJsonObject();// Extract creation date and add to metadata if availableString createDateTime = Optional.ofNullable(jsonMetaData.get("createdDateTime")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(createDateTime)) {metadata.put("createdTime", Instant.parse(createDateTime).toEpochMilli());}// Extract last modified date and add to metadata if availableString lastModifiedDateTime = Optional.ofNullable(jsonMetaData.get("lastModifiedDateTime")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(lastModifiedDateTime)) {metadata.put("lastModifiedTime", Instant.parse(lastModifiedDateTime).toEpochMilli());}// Extract content URL and add to metadata if 
availableString contentURL = Optional.ofNullable(jsonMetaData.get("contentUrl")).map(JsonElement::getAsString).orElse(null);if (StringUtils.hasText(contentURL)) {metadata.put("contentURL", contentURL);}}catch (Exception e) {log.warn("Failed to get page content with token: {}, resourceId: {}, resourceType: {}, {}", accessToken,resourceId, resourceType, e.getMessage(), e);throw new RuntimeException("Failed to get page content", e);}return metadata;}}

OneNoteDocumentReader构造器要求输入accessToken及oneNoteResource,它会构建HttpClient,其get方法根据resourceType执行不同的逻辑,NOTEBOOK执行getNoteBookContent,SECTION执行getSectionContent,PAGE执行getPageContent,之后通过buildMetadata构建metaData,最后构建Document返回
getNoteBookContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentNotebook&$filter=parentNotebook/id+eq+'notebookId'提取pageIds,然后通过getPageContent获取内容
getSectionContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/?$expand=parentSection&$filter=parentSection/id+eq+'sectionId'提取pageIds,然后通过getPageContent获取内容
getPageContent请求https://graph.microsoft.com/v1.0/me/onenote/pages/pageId/content获取html结果,再通过jsoup解析title、text,最后通过\n拼接返回
buildMetadata方法根据不同的resourceType构建不同的请求uri,请求之后提取createdTime、lastModifiedTime、contentURL

OneNoteResource

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/main/java/com/alibaba/cloud/api/reader/onenote/OneNoteResource.java

/**
 * Identifies the OneNote resource to read: a non-empty resource id together with its
 * {@link ResourceType}.
 */
public class OneNoteResource implements Resource {

    /** Key of the {@code "source"} entry; OneNoteDocumentReader stores the resource URI under it. */
    public static final String SOURCE = "source";

    /** The kinds of OneNote resource that can be addressed. */
    public enum ResourceType {
        NOTEBOOK, SECTION, PAGE
    }

    private final ResourceType resourceType;

    private final String resourceId;

    /** Returns the kind of resource this instance points at. */
    public ResourceType getResourceType() {
        return resourceType;
    }

    /** Returns the id of the resource this instance points at. */
    public String getResourceId() {
        return resourceId;
    }

    /**
     * Creates a resource descriptor.
     * @param resourceId id of the resource; checked with {@code Assert.hasText}
     * @param resourceType kind of the resource; checked with {@code Assert.notNull}
     */
    public OneNoteResource(String resourceId, ResourceType resourceType) {
        Assert.hasText(resourceId, "ResourceId must not be empty");
        Assert.notNull(resourceType, "ResourceType must not be null");
        this.resourceId = resourceId;
        this.resourceType = resourceType;
    }

    // ...... (remaining members are omitted in the article; presumably they include the
    // builder() used by the test class and the Resource interface methods - verify
    // against the full source)
}

OneNoteResource主要是定义了NOTEBOOK, SECTION, PAGE这三种resourceType以及resourceId

示例

community/document-readers/spring-ai-alibaba-starter-document-reader-onenote/src/test/java/com/alibaba/cloud/api/reader/onenote/OneNoteDocumentReaderTest.java

@EnabledIfEnvironmentVariable(named = "ONENOTE_ACCESS_TOKEN", matches = ".+")
public class OneNoteDocumentReaderTest {private static final String TEST_ACCESS_TOKEN = System.getenv("ONENOTE_ACCESS_TOKEN");private static final String TEST_NOTEBOOK_ID = "${notebookId}";private static final String TEST_SECTION_ID = "${sectionId}";private static final String TEST_PAGE_ID = "${pageId}";private OneNoteDocumentReader oneNoteDocumentReader;static {if (TEST_ACCESS_TOKEN == null || TEST_ACCESS_TOKEN.isEmpty()) {System.out.println("ONENOTE_ACCESS_TOKEN environment variable is not set. Tests will be skipped.");}}@Testpublic void test_load_page() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_PAGE_ID).resourceType(OneNoteResource.ResourceType.PAGE).build();OneNoteDocumentReader oneNoteDocumentReader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.PAGE.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_PAGE_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}@Testpublic void test_load_section() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_SECTION_ID).resourceType(OneNoteResource.ResourceType.SECTION).build();OneNoteDocumentReader oneNoteDocumentReader = new 
OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.SECTION.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_SECTION_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}@Testpublic void test_load_notebook() {// Ensure TEST_ACCESS_TOKEN is not null, otherwise skip the testAssumptions.assumeTrue(TEST_ACCESS_TOKEN != null && !TEST_ACCESS_TOKEN.isEmpty(),"Skipping test because ONENOTE_ACCESS_TOKEN is not set");// Create page readerOneNoteResource oneNoteResource = OneNoteResource.builder().resourceId(TEST_NOTEBOOK_ID).resourceType(OneNoteResource.ResourceType.NOTEBOOK).build();OneNoteDocumentReader oneNoteDocumentReader = new OneNoteDocumentReader(TEST_ACCESS_TOKEN, oneNoteResource);List<Document> documents = oneNoteDocumentReader.get();// thenassertThat(documents).isNotEmpty();Document document = documents.get(0);// Verify metadataassertThat(document.getMetadata()).containsKey(OneNoteResource.SOURCE);assertThat(document.getMetadata().get("resourceType")).isEqualTo(OneNoteResource.ResourceType.NOTEBOOK.name());assertThat(document.getMetadata().get("resourceId")).isEqualTo(TEST_NOTEBOOK_ID);// Verify contentString content = document.getText();assertThat(content).isNotEmpty();}}

小结

spring-ai-alibaba-starter-document-reader-onenote提供了OneNoteDocumentReader,用于根据accessToken、resourceId、resourceType获取OneNote的内容及元数据(metadata)。

doc

  • java2ai

相关文章:

  • 实现Variant
  • AI赋能Python长时序植被遥感动态分析、物候提取、时空变异归因及RSEI生态评估
  • 系统高性能设计核心机制图解:缓存优化、链表调度与时间轮原理
  • 白鲸开源WhaleStudio与崖山数据库管理系统YashanDB完成产品兼容互认证
  • 麒麟系统离线安装软件方法(kazam录屏软件为例)
  • SEO的关键词研究与优化 第一章
  • AI | 最近比较火的几个生成式对话 AI
  • YOLO训练时到底需不需要使用权重
  • 【AI提示词】私人教练
  • 昆仑万维开源SkyReels-V2,解锁无限时长电影级创作,总分83.9%登顶V-Bench榜单
  • 使用正确的 JVM 功能加速现有部署
  • Kaamel视角下的MCP安全最佳实践
  • python-69-基于graphviz可视化软件生成流程图
  • 文件操作、流对象示例
  • 用 Python 实现基于 Open CASCADE 的 CAD 绘图工具
  • 碰一碰发视频源码文案功能,支持OEM
  • VulnHub-DC-2靶机渗透教程
  • 编译型语言、解释型语言与混合型语言:原理、区别与应用场景详解
  • 【C++】STL之deque
  • flutter 中各种日志
  • 人大法工委:涉核领域还需要有一部统领性的基础法律
  • 帕力旦·吐尔逊已任喀什大学党委副书记、校长
  • 拍片无小事,牙齿也有故事
  • 印控克什米尔26名游客遭恐袭丧生后,印度对巴宣布多项反制措施
  • 刺激视网膜可让人“看”到全新颜色
  • 体坛联播|AC米兰挺进意大利杯决赛,弗雷戴特宣布退役